auto1-oss · ernest-gonzales · Mar 31, 2026 · Mar 23, 2026
diff --git a/pr_agent/algo/context_enrichment.py b/pr_agent/algo/context_enrichment.py
@@ -27,6 +27,31 @@ def append_small_file_context_to_diff(
     return f"{diff_text}\n\nAdditional context for small touched files:\n======\n{context}\n======"
 
 
+def append_small_file_context_to_diffs(
+    diff_texts: list[str],
+    diff_files: list[FilePatchInfo],
+    token_handler,
+    *,
+    max_lines: int,
+    max_tokens: int,
+) -> list[str]:
+    if not diff_texts:
+        return diff_texts
+
+    return [
+        append_small_file_context_to_diff(
+            diff_text,
+            diff_files,
+            token_handler,
+            max_lines=max_lines,
+            max_tokens=max_tokens,
+        )
+        if diff_text
+        else diff_text
+        for diff_text in diff_texts
+    ]
+
+
 def build_small_file_context(
     diff_text: str,
     diff_files: list[FilePatchInfo],

diff --git a/pr_agent/algo/suggestion_output_filter.py b/pr_agent/algo/suggestion_output_filter.py
@@ -0,0 +1,116 @@
+import copy
+import logging
+
+
+VALID_CONFIDENCE_VALUES = {"high", "medium", "low"}
+VALID_EVIDENCE_TYPES = {"diff", "ticket", "inferred"}
+FINDINGS_FILTER_MODE_OFF = "off"
+FINDINGS_FILTER_MODE_DROP_LOW_CONFIDENCE_INFERRED = "drop_low_confidence_inferred"
+LOGGER = logging.getLogger(__name__)
+
+
+def normalize_code_suggestions_output(
+    data: dict,
+    findings_metadata: bool = False,
+    filter_mode: str = FINDINGS_FILTER_MODE_OFF,
+) -> dict:
+    if not isinstance(data, dict) or "code_suggestions" not in data or not isinstance(data["code_suggestions"], list):
+        return data
+
+    if filter_mode != FINDINGS_FILTER_MODE_OFF and not findings_metadata:
+        _log_warning("findings_filter_mode has no effect when findings_metadata is false")
+        filter_mode = FINDINGS_FILTER_MODE_OFF
+    elif filter_mode not in {FINDINGS_FILTER_MODE_OFF, FINDINGS_FILTER_MODE_DROP_LOW_CONFIDENCE_INFERRED}:
+        _log_warning(f"Unknown findings_filter_mode: {filter_mode!r}, defaulting to off")
+        filter_mode = FINDINGS_FILTER_MODE_OFF
+
+    normalized_data = copy.deepcopy(data)
+    suggestions = normalized_data.get("code_suggestions")
+    if not isinstance(suggestions, list):
+        return normalized_data
+
+    normalized_suggestions = []
+    for suggestion in suggestions:
+        normalized_suggestion = normalize_suggestion(suggestion, findings_metadata=findings_metadata)
+        if normalized_suggestion:
+            normalized_suggestions.append(normalized_suggestion)
+
+    filtered_suggestions, suppressed_count = filter_suggestions(normalized_suggestions, filter_mode=filter_mode)
+    if suppressed_count:
+        _log_debug("Suppressed low-confidence inferred code suggestions", artifact={"count": suppressed_count})
+    normalized_data["code_suggestions"] = filtered_suggestions
+
+    return normalized_data
+
+
+def normalize_suggestion(suggestion: dict, findings_metadata: bool = False) -> dict | None:
+    if not isinstance(suggestion, dict):
+        return None
+
+    normalized_suggestion = copy.deepcopy(suggestion)
+
+    if findings_metadata:
+        confidence = normalize_enum_value(suggestion.get("confidence"), VALID_CONFIDENCE_VALUES)
+        if confidence:
+            normalized_suggestion["confidence"] = confidence
+        else:
+            normalized_suggestion.pop("confidence", None)
+
+        evidence_type = normalize_enum_value(suggestion.get("evidence_type"), VALID_EVIDENCE_TYPES)
+        if evidence_type:
+            normalized_suggestion["evidence_type"] = evidence_type
+        else:
+            normalized_suggestion.pop("evidence_type", None)
+    else:
+        normalized_suggestion.pop("confidence", None)
+        normalized_suggestion.pop("evidence_type", None)
+
+    return normalized_suggestion
+
+
+def normalize_enum_value(value, valid_values: set[str]) -> str | None:
+    if value is None:
+        return None
+
+    normalized_value = str(value).strip().lower()
+    if normalized_value in valid_values:
+        return normalized_value
+
+    return None
+
+
+def filter_suggestions(suggestions: list[dict], filter_mode: str = FINDINGS_FILTER_MODE_OFF) -> tuple[list[dict], int]:
+    if filter_mode != FINDINGS_FILTER_MODE_DROP_LOW_CONFIDENCE_INFERRED:
+        return suggestions, 0
+
+    filtered_suggestions = []
+    suppressed_count = 0
+    for suggestion in suggestions:
+        if suggestion.get("confidence") == "low" and suggestion.get("evidence_type") == "inferred":
+            suppressed_count += 1
+            continue
+        filtered_suggestions.append(suggestion)
+
+    return filtered_suggestions, suppressed_count
+
+
+def _log_warning(message: str) -> None:
+    logger = _resolve_logger()
+    logger.warning(message)
+
+
+def _log_debug(message: str, artifact: dict) -> None:
+    logger = _resolve_logger()
+    try:
+        logger.debug(message, artifact=artifact)
+    except TypeError:
+        logger.debug(message, extra={"artifact": artifact})
+
+
+def _resolve_logger():
+    try:
+        from pr_agent.log import get_logger
+
+        return get_logger()
+    except Exception:
+        return LOGGER
diff --git a/pr_agent/algo/utils.py b/pr_agent/algo/utils.py
@@ -380,21 +380,43 @@ def format_review_issue_metadata(issue: dict, gfm_supported: bool, enable_badges
     return " ".join([f"`{label}`" for label in metadata_labels])
 
 
-def format_code_suggestion_metadata(label: str, score=None, enable_badges: bool | None = None) -> str:
+def format_code_suggestion_metadata(label: str, score=None, confidence: str | None = None,
+                                    evidence_type: str | None = None,
+                                    enable_badges: bool | None = None) -> str:
     if enable_badges is None:
-        enable_badges = get_settings().pr_reviewer.get("findings_metadata_badges", False)
+        enable_badges = get_settings().pr_code_suggestions.get(
+            "findings_metadata_badges",
+            get_settings().pr_reviewer.get("findings_metadata_badges", False),
+        )
     if not enable_badges:
         return ""
 
+    metadata_labels = []
     label = str(label).strip()
-    if not label:
-        return ""
+    if label:
+        metadata_labels.append(label)
 
     score_value = "" if score is None else str(score).strip()
     if score_value:
-        return f" [{label}, importance: {score_value}]"
+        metadata_labels.append(f"importance: {score_value}")
+
+    normalized_confidence = str(confidence or "").strip().lower()
+    if normalized_confidence in {"high", "medium", "low"}:
+        metadata_labels.append(f"{normalized_confidence.capitalize()} confidence")
+
+    normalized_evidence_type = str(evidence_type or "").strip().lower()
+    evidence_labels = {
+        "diff": "Diff evidence",
+        "ticket": "Ticket evidence",
+        "inferred": "Inferred evidence",
+    }
+    if normalized_evidence_type in evidence_labels:
+        metadata_labels.append(evidence_labels[normalized_evidence_type])
+
+    if not metadata_labels:
+        return ""
 
-    return f" [{label}]"
+    return f" [{', '.join(metadata_labels)}]"
 
 
 def extract_relevant_lines_str(end_line, files, relevant_file, start_line, dedent=False) -> str:

diff --git a/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts.toml b/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts.toml
@@ -72,6 +72,16 @@ Specific guidelines for generating code suggestions:
 {%- endif %}
 - Be aware that your input consists only of partial code segments (PR diff code), not the complete codebase. Therefore, avoid making suggestions that might duplicate existing functionality, and refrain from questioning code elements (such as variable declarations or import statements) that may be defined elsewhere in the codebase.
 - When mentioning code elements (variables, names, or files) in your response, surround them with backticks (`). For example: "verify that `user_id` is..."
+{%- if small_file_context %}
+- Some files may include a `## Full file context:` section with the complete post-change contents of a small touched file. Use this only to validate suggestions related to the changed file, and do not report issues from untouched regions unless they are necessary to explain a defect introduced by the PR.
+{%- endif %}
+{%- if findings_metadata %}
+- For each suggestion, set `confidence` from `high`, `medium`, or `low` based only on the visible evidence.
+- Set `evidence_type` to `diff` when changed lines directly support the suggestion, `ticket` for a clear ticket mismatch, or `inferred` when the suggestion depends on indirect clues.
+{%- endif %}
+- For lower-severity concerns, be certain before suggesting. If you cannot confidently explain why something is a problem with a concrete scenario, do not suggest it.
+- Do not speculate that a change might break other code unless you can identify the specific affected code path from the visible diff context.
+- When confidence is limited but the potential impact is high, make the uncertainty explicit. Otherwise, prefer returning fewer suggestions over guessing.
 
 {%- if extra_instructions %}
 
@@ -92,6 +102,10 @@ class CodeSuggestion(BaseModel):
     suggestion_content: str = Field(description="An actionable suggestion to enhance, improve or fix the new code introduced in the PR. Don't present here actual code snippets, just the suggestion. Be short and concise")
     improved_code: str = Field(description="A refined code snippet that replaces the 'existing_code' snippet after implementing the suggestion.")
     one_sentence_summary: str = Field(description="A concise, single-sentence overview (up to 6 words) of the suggested improvement. Focus on the 'what'. Be general, and avoid method or variable names.")
+{%- if findings_metadata %}
+    confidence: str = Field(description="Your confidence in this suggestion based only on the visible evidence. Allowed values: high, medium, low")
+    evidence_type: str = Field(description="What directly supports this suggestion. Allowed values: diff, ticket, inferred")
+{%- endif %}
 {%- if not focus_only_on_problems %}
     label: str = Field(description="A single, descriptive label that best characterizes the suggestion type. Possible labels include 'security', 'possible bug', 'possible issue', 'performance', 'enhancement', 'best practice', 'maintainability', 'typo'. Other relevant labels are also acceptable.")
 {%- else %}
@@ -119,6 +133,12 @@ code_suggestions:
     ...
   one_sentence_summary: |
     ...
+{%- if findings_metadata %}
+  confidence: |
+    high
+  evidence_type: |
+    diff
+{%- endif %}
   label: |
     ...
 ```
@@ -158,6 +178,12 @@ code_suggestions:
     ...
   one_sentence_summary: |
     ...
+{%- if findings_metadata %}
+  confidence: |
+    high
+  evidence_type: |
+    diff
+{%- endif %}
   label: |
     ...
 ```

diff --git a/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts_not_decoupled.toml b/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts_not_decoupled.toml
@@ -61,6 +61,16 @@ Specific guidelines for generating code suggestions:
 - When mentioning code elements (variables, names, or files) in your response, surround them with markdown backticks (`). For example: "verify that `user_id` is..."
 - Note that you will only see partial code segments that were changed (diff hunks in a PR code), and not the entire codebase. Avoid suggestions that might duplicate existing functionality of the outer codebase. In addition, the absence of a definition, declaration, import, or initialization for any entity in the PR code is NEVER a basis for a suggestion.
 - Also note that if the code ends at an opening brace or statement that begins a new scope (like 'if', 'for', 'try'), don't treat it as incomplete. Instead, acknowledge the visible scope boundary and analyze only the code shown.
+{%- if small_file_context %}
+- Some files may include a `## Full file context:` section with the complete post-change contents of a small touched file. Use this only to validate suggestions related to the changed file, and do not report issues from untouched regions unless they are necessary to explain a defect introduced by the PR.
+{%- endif %}
+{%- if findings_metadata %}
+- For each suggestion, set `confidence` from `high`, `medium`, or `low` based only on the visible evidence.
+- Set `evidence_type` to `diff` when changed lines directly support the suggestion, `ticket` for a clear ticket mismatch, or `inferred` when the suggestion depends on indirect clues.
+{%- endif %}
+- For lower-severity concerns, be certain before suggesting. If you cannot confidently explain why something is a problem with a concrete scenario, do not suggest it.
+- Do not speculate that a change might break other code unless you can identify the specific affected code path from the visible diff context.
+- When confidence is limited but the potential impact is high, make the uncertainty explicit. Otherwise, prefer returning fewer suggestions over guessing.
 
 {%- if extra_instructions %}
 
@@ -81,6 +91,10 @@ class CodeSuggestion(BaseModel):
     suggestion_content: str = Field(description="An actionable suggestion to enhance, improve or fix the new code introduced in the PR. Use 2-3 short sentences.")
     improved_code: str = Field(description="A refined code snippet that replaces the 'existing_code' snippet after implementing the suggestion.")
     one_sentence_summary: str = Field(description="A single-sentence overview (up to 6 words) of the suggestion. Focus on the 'what'. Be general, and avoid mentioning method or variable names.")
+{%- if findings_metadata %}
+    confidence: str = Field(description="Your confidence in this suggestion based only on the visible evidence. Allowed values: high, medium, low")
+    evidence_type: str = Field(description="What directly supports this suggestion. Allowed values: diff, ticket, inferred")
+{%- endif %}
 {%- if not focus_only_on_problems %}
     label: str = Field(description="A single, descriptive label that best characterizes the suggestion type. Possible labels include 'security', 'possible bug', 'possible issue', 'performance', 'enhancement', 'best practice', 'maintainability', 'typo'. Other relevant labels are also acceptable.")
 {%- else %}
@@ -108,6 +122,12 @@ code_suggestions:
     ...
   one_sentence_summary: |
     ...
+{%- if findings_metadata %}
+  confidence: |
+    high
+  evidence_type: |
+    diff
+{%- endif %}
   label: |
     ...
 ```
@@ -147,6 +167,12 @@ code_suggestions:
     ...
   one_sentence_summary: |
     ...
+{%- if findings_metadata %}
+  confidence: |
+    high
+  evidence_type: |
+    diff
+{%- endif %}
   label: |
     ...
 ```

diff --git a/pr_agent/settings/code_suggestions/pr_code_suggestions_reflect_prompts.toml b/pr_agent/settings/code_suggestions/pr_code_suggestions_reflect_prompts.toml
@@ -22,6 +22,7 @@ Key guidelines for evaluation:
 - Extend your review beyond the specifically mentioned code lines to encompass surrounding PR code context, verifying the suggestions' contextual accuracy.
 - Validate the 'existing_code' field by confirming it matches or is accurately derived from code lines within a '__new hunk__' section of the PR code diff.
 - Ensure the 'improved_code' section accurately reflects the 'existing_code' segment after the suggested modification is applied.
+- If a suggestion depends on indirect clues rather than directly visible evidence, lower confidence accordingly. Suggestions with weak inferred evidence should receive a low score.
 - Apply a nuanced scoring system:
   - Reserve high scores (9-10) for critical issues such as major bugs or security concerns (merge-blocking, won't compile,
     deterministic test failures, common-path crash, data loss/corruption, or clear contract breakage).
@@ -82,6 +83,13 @@ __old hunk__
 {%- if is_ai_metadata %}
 - When available, an AI-generated summary will precede each file's diff, with a high-level overview of the changes. Note that this summary may not be fully accurate or comprehensive.
 {%- endif %}
+{%- if small_file_context %}
+- Some files may include a `## Full file context:` section with the complete post-change contents of a small touched file. Use this only to validate suggestions related to the changed file, and do not report issues from untouched regions unless they are necessary to explain a defect introduced by the PR.
+{%- endif %}
+{%- if findings_metadata %}
+- For each suggestion, set `confidence` from `high`, `medium`, or `low` based only on the visible evidence.
+- Set `evidence_type` to `diff` when changed lines directly support the suggestion, `ticket` for a clear ticket mismatch, or `inferred` when the suggestion depends on indirect clues.
+{%- endif %}
 
 
 The output must be a YAML object equivalent to type $PRCodeSuggestionsFeedback, according to the following Pydantic definitions:
@@ -92,6 +100,10 @@ class CodeSuggestionFeedback(BaseModel):
     relevant_lines_start: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion starts (inclusive). Should be derived from the added '__new hunk__' line numbers, and correspond to the first line of the relevant 'existing code' snippet.")
     relevant_lines_end: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion ends (inclusive). Should be derived from the added '__new hunk__' line numbers, and correspond to the end of the relevant 'existing code' snippet")
     suggestion_score: int = Field(description="Evaluate the suggestion and assign a score from 0 to 10. Give 0 if the suggestion is wrong. For valid suggestions, score from 1 (lowest impact/importance) to 10 (highest impact/importance).")
+{%- if findings_metadata %}
+    confidence: str = Field(description="Your confidence in this suggestion based only on the visible evidence. Allowed values: high, medium, low")
+    evidence_type: str = Field(description="What directly supports this suggestion. Allowed values: diff, ticket, inferred")
+{%- endif %}
     why: str = Field(description="Briefly explain the score given in 1-2 short sentences, focusing on the suggestion's impact, relevance, and accuracy. When mentioning code elements (variables, names, or files) in your response, surround them with markdown backticks (`).")
 
 class PRCodeSuggestionsFeedback(BaseModel):
@@ -108,6 +120,12 @@ code_suggestions:
   relevant_lines_start: 13
   relevant_lines_end: 14
   suggestion_score: 6
+{%- if findings_metadata %}
+  confidence: |
+    medium
+  evidence_type: |
+    diff
+{%- endif %}
   why: |
     The variable name 't' is not descriptive enough
 - ...
@@ -141,6 +159,12 @@ code_suggestions:
   relevant_lines_start: ...
   relevant_lines_end: ...
   suggestion_score: ...
+{%- if findings_metadata %}
+  confidence: |
+    ...
+  evidence_type: |
+    ...
+{%- endif %}
   why: |
     ...
 - ...

diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml
@@ -138,6 +138,12 @@ use_conversation_history=true
 commitable_code_suggestions = false
 dual_publishing_score_threshold=-1 # -1 to disable, [0-10] to set the threshold (>=) for publishing a code suggestion both in a table and as commitable
 focus_only_on_problems=true
+findings_metadata = false
+findings_metadata_badges = false
+findings_filter_mode = "off"
+small_file_context = false
+small_file_context_max_lines = 150
+small_file_context_max_tokens = 3500
 #
 extra_instructions = ""
 enable_help_text=false