Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions pr_agent/algo/context_enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,31 @@ def append_small_file_context_to_diff(
return f"{diff_text}\n\nAdditional context for small touched files:\n======\n{context}\n======"


def append_small_file_context_to_diffs(
diff_texts: list[str],
diff_files: list[FilePatchInfo],
token_handler,
*,
max_lines: int,
max_tokens: int,
) -> list[str]:
if not diff_texts:
return diff_texts

return [
append_small_file_context_to_diff(
diff_text,
diff_files,
token_handler,
max_lines=max_lines,
max_tokens=max_tokens,
)
if diff_text
else diff_text
for diff_text in diff_texts
]


def build_small_file_context(
diff_text: str,
diff_files: list[FilePatchInfo],
Expand Down
116 changes: 116 additions & 0 deletions pr_agent/algo/suggestion_output_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import copy
import logging


VALID_CONFIDENCE_VALUES = {"high", "medium", "low"}
VALID_EVIDENCE_TYPES = {"diff", "ticket", "inferred"}
FINDINGS_FILTER_MODE_OFF = "off"
FINDINGS_FILTER_MODE_DROP_LOW_CONFIDENCE_INFERRED = "drop_low_confidence_inferred"
LOGGER = logging.getLogger(__name__)


def normalize_code_suggestions_output(
data: dict,
findings_metadata: bool = False,
filter_mode: str = FINDINGS_FILTER_MODE_OFF,
) -> dict:
if not isinstance(data, dict) or "code_suggestions" not in data or not isinstance(data["code_suggestions"], list):
return data

if filter_mode != FINDINGS_FILTER_MODE_OFF and not findings_metadata:
_log_warning("findings_filter_mode has no effect when findings_metadata is false")
filter_mode = FINDINGS_FILTER_MODE_OFF
elif filter_mode not in {FINDINGS_FILTER_MODE_OFF, FINDINGS_FILTER_MODE_DROP_LOW_CONFIDENCE_INFERRED}:
_log_warning(f"Unknown findings_filter_mode: {filter_mode!r}, defaulting to off")
filter_mode = FINDINGS_FILTER_MODE_OFF

normalized_data = copy.deepcopy(data)
suggestions = normalized_data.get("code_suggestions")
if not isinstance(suggestions, list):
return normalized_data

normalized_suggestions = []
for suggestion in suggestions:
normalized_suggestion = normalize_suggestion(suggestion, findings_metadata=findings_metadata)
if normalized_suggestion:
normalized_suggestions.append(normalized_suggestion)

filtered_suggestions, suppressed_count = filter_suggestions(normalized_suggestions, filter_mode=filter_mode)
if suppressed_count:
_log_debug("Suppressed low-confidence inferred code suggestions", artifact={"count": suppressed_count})
normalized_data["code_suggestions"] = filtered_suggestions

return normalized_data


def normalize_suggestion(suggestion: dict, findings_metadata: bool = False) -> dict | None:
if not isinstance(suggestion, dict):
return None

normalized_suggestion = copy.deepcopy(suggestion)

if findings_metadata:
confidence = normalize_enum_value(suggestion.get("confidence"), VALID_CONFIDENCE_VALUES)
if confidence:
normalized_suggestion["confidence"] = confidence
else:
normalized_suggestion.pop("confidence", None)

evidence_type = normalize_enum_value(suggestion.get("evidence_type"), VALID_EVIDENCE_TYPES)
if evidence_type:
normalized_suggestion["evidence_type"] = evidence_type
else:
normalized_suggestion.pop("evidence_type", None)
else:
normalized_suggestion.pop("confidence", None)
normalized_suggestion.pop("evidence_type", None)

return normalized_suggestion


def normalize_enum_value(value, valid_values: set[str]) -> str | None:
if value is None:
return None

normalized_value = str(value).strip().lower()
if normalized_value in valid_values:
return normalized_value

return None


def filter_suggestions(suggestions: list[dict], filter_mode: str = FINDINGS_FILTER_MODE_OFF) -> tuple[list[dict], int]:
if filter_mode != FINDINGS_FILTER_MODE_DROP_LOW_CONFIDENCE_INFERRED:
return suggestions, 0

filtered_suggestions = []
suppressed_count = 0
for suggestion in suggestions:
if suggestion.get("confidence") == "low" and suggestion.get("evidence_type") == "inferred":
suppressed_count += 1
continue
filtered_suggestions.append(suggestion)

return filtered_suggestions, suppressed_count


def _log_warning(message: str) -> None:
logger = _resolve_logger()
logger.warning(message)


def _log_debug(message: str, artifact: dict) -> None:
logger = _resolve_logger()
try:
logger.debug(message, artifact=artifact)
except TypeError:
logger.debug(message, extra={"artifact": artifact})


def _resolve_logger():
try:
from pr_agent.log import get_logger

return get_logger()
except Exception:
return LOGGER
34 changes: 28 additions & 6 deletions pr_agent/algo/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,21 +380,43 @@ def format_review_issue_metadata(issue: dict, gfm_supported: bool, enable_badges
return " ".join([f"`{label}`" for label in metadata_labels])


def format_code_suggestion_metadata(label: str, score=None, enable_badges: bool | None = None) -> str:
def format_code_suggestion_metadata(label: str, score=None, confidence: str | None = None,
evidence_type: str | None = None,
enable_badges: bool | None = None) -> str:
if enable_badges is None:
enable_badges = get_settings().pr_reviewer.get("findings_metadata_badges", False)
enable_badges = get_settings().pr_code_suggestions.get(
"findings_metadata_badges",
get_settings().pr_reviewer.get("findings_metadata_badges", False),
)
if not enable_badges:
return ""

metadata_labels = []
label = str(label).strip()
if not label:
return ""
if label:
metadata_labels.append(label)

score_value = "" if score is None else str(score).strip()
if score_value:
return f" [{label}, importance: {score_value}]"
metadata_labels.append(f"importance: {score_value}")

normalized_confidence = str(confidence or "").strip().lower()
if normalized_confidence in {"high", "medium", "low"}:
metadata_labels.append(f"{normalized_confidence.capitalize()} confidence")

normalized_evidence_type = str(evidence_type or "").strip().lower()
evidence_labels = {
"diff": "Diff evidence",
"ticket": "Ticket evidence",
"inferred": "Inferred evidence",
}
if normalized_evidence_type in evidence_labels:
metadata_labels.append(evidence_labels[normalized_evidence_type])

if not metadata_labels:
return ""

return f" [{label}]"
return f" [{', '.join(metadata_labels)}]"


def extract_relevant_lines_str(end_line, files, relevant_file, start_line, dedent=False) -> str:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,16 @@ Specific guidelines for generating code suggestions:
{%- endif %}
- Be aware that your input consists only of partial code segments (PR diff code), not the complete codebase. Therefore, avoid making suggestions that might duplicate existing functionality, and refrain from questioning code elements (such as variable declarations or import statements) that may be defined elsewhere in the codebase.
- When mentioning code elements (variables, names, or files) in your response, surround them with backticks (`). For example: "verify that `user_id` is..."
{%- if small_file_context %}
- Some files may include a `## Full file context:` section with the complete post-change contents of a small touched file. Use this only to validate suggestions related to the changed file, and do not report issues from untouched regions unless they are necessary to explain a defect introduced by the PR.
{%- endif %}
{%- if findings_metadata %}
- For each suggestion, set `confidence` from `high`, `medium`, or `low` based only on the visible evidence.
- Set `evidence_type` to `diff` when changed lines directly support the suggestion, `ticket` for a clear ticket mismatch, or `inferred` when the suggestion depends on indirect clues.
{%- endif %}
- For lower-severity concerns, be certain before suggesting. If you cannot confidently explain why something is a problem with a concrete scenario, do not suggest it.
- Do not speculate that a change might break other code unless you can identify the specific affected code path from the visible diff context.
- When confidence is limited but the potential impact is high, make the uncertainty explicit. Otherwise, prefer returning fewer suggestions over guessing.

{%- if extra_instructions %}

Expand All @@ -92,6 +102,10 @@ class CodeSuggestion(BaseModel):
suggestion_content: str = Field(description="An actionable suggestion to enhance, improve or fix the new code introduced in the PR. Don't present here actual code snippets, just the suggestion. Be short and concise")
improved_code: str = Field(description="A refined code snippet that replaces the 'existing_code' snippet after implementing the suggestion.")
one_sentence_summary: str = Field(description="A concise, single-sentence overview (up to 6 words) of the suggested improvement. Focus on the 'what'. Be general, and avoid method or variable names.")
{%- if findings_metadata %}
confidence: str = Field(description="Your confidence in this suggestion based only on the visible evidence. Allowed values: high, medium, low")
evidence_type: str = Field(description="What directly supports this suggestion. Allowed values: diff, ticket, inferred")
{%- endif %}
{%- if not focus_only_on_problems %}
label: str = Field(description="A single, descriptive label that best characterizes the suggestion type. Possible labels include 'security', 'possible bug', 'possible issue', 'performance', 'enhancement', 'best practice', 'maintainability', 'typo'. Other relevant labels are also acceptable.")
{%- else %}
Expand Down Expand Up @@ -119,6 +133,12 @@ code_suggestions:
...
one_sentence_summary: |
...
{%- if findings_metadata %}
confidence: |
high
evidence_type: |
diff
{%- endif %}
label: |
...
```
Expand Down Expand Up @@ -158,6 +178,12 @@ code_suggestions:
...
one_sentence_summary: |
...
{%- if findings_metadata %}
confidence: |
high
evidence_type: |
diff
{%- endif %}
label: |
...
```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,16 @@ Specific guidelines for generating code suggestions:
- When mentioning code elements (variables, names, or files) in your response, surround them with markdown backticks (`). For example: "verify that `user_id` is..."
- Note that you will only see partial code segments that were changed (diff hunks in a PR code), and not the entire codebase. Avoid suggestions that might duplicate existing functionality of the outer codebase. In addition, the absence of a definition, declaration, import, or initialization for any entity in the PR code is NEVER a basis for a suggestion.
- Also note that if the code ends at an opening brace or statement that begins a new scope (like 'if', 'for', 'try'), don't treat it as incomplete. Instead, acknowledge the visible scope boundary and analyze only the code shown.
{%- if small_file_context %}
- Some files may include a `## Full file context:` section with the complete post-change contents of a small touched file. Use this only to validate suggestions related to the changed file, and do not report issues from untouched regions unless they are necessary to explain a defect introduced by the PR.
{%- endif %}
{%- if findings_metadata %}
- For each suggestion, set `confidence` from `high`, `medium`, or `low` based only on the visible evidence.
- Set `evidence_type` to `diff` when changed lines directly support the suggestion, `ticket` for a clear ticket mismatch, or `inferred` when the suggestion depends on indirect clues.
{%- endif %}
- For lower-severity concerns, be certain before suggesting. If you cannot confidently explain why something is a problem with a concrete scenario, do not suggest it.
- Do not speculate that a change might break other code unless you can identify the specific affected code path from the visible diff context.
- When confidence is limited but the potential impact is high, make the uncertainty explicit. Otherwise, prefer returning fewer suggestions over guessing.

{%- if extra_instructions %}

Expand All @@ -81,6 +91,10 @@ class CodeSuggestion(BaseModel):
suggestion_content: str = Field(description="An actionable suggestion to enhance, improve or fix the new code introduced in the PR. Use 2-3 short sentences.")
improved_code: str = Field(description="A refined code snippet that replaces the 'existing_code' snippet after implementing the suggestion.")
one_sentence_summary: str = Field(description="A single-sentence overview (up to 6 words) of the suggestion. Focus on the 'what'. Be general, and avoid mentioning method or variable names.")
{%- if findings_metadata %}
confidence: str = Field(description="Your confidence in this suggestion based only on the visible evidence. Allowed values: high, medium, low")
evidence_type: str = Field(description="What directly supports this suggestion. Allowed values: diff, ticket, inferred")
{%- endif %}
{%- if not focus_only_on_problems %}
label: str = Field(description="A single, descriptive label that best characterizes the suggestion type. Possible labels include 'security', 'possible bug', 'possible issue', 'performance', 'enhancement', 'best practice', 'maintainability', 'typo'. Other relevant labels are also acceptable.")
{%- else %}
Expand Down Expand Up @@ -108,6 +122,12 @@ code_suggestions:
...
one_sentence_summary: |
...
{%- if findings_metadata %}
confidence: |
high
evidence_type: |
diff
{%- endif %}
label: |
...
```
Expand Down Expand Up @@ -147,6 +167,12 @@ code_suggestions:
...
one_sentence_summary: |
...
{%- if findings_metadata %}
confidence: |
high
evidence_type: |
diff
{%- endif %}
label: |
...
```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Key guidelines for evaluation:
- Extend your review beyond the specifically mentioned code lines to encompass surrounding PR code context, verifying the suggestions' contextual accuracy.
- Validate the 'existing_code' field by confirming it matches or is accurately derived from code lines within a '__new hunk__' section of the PR code diff.
- Ensure the 'improved_code' section accurately reflects the 'existing_code' segment after the suggested modification is applied.
- If a suggestion depends on indirect clues rather than directly visible evidence, lower confidence accordingly. Suggestions with weak inferred evidence should receive a low score.
- Apply a nuanced scoring system:
- Reserve high scores (9-10) for critical issues such as major bugs or security concerns (merge-blocking, won't compile,
deterministic test failures, common-path crash, data loss/corruption, or clear contract breakage).
Expand Down Expand Up @@ -82,6 +83,13 @@ __old hunk__
{%- if is_ai_metadata %}
- When available, an AI-generated summary will precede each file's diff, with a high-level overview of the changes. Note that this summary may not be fully accurate or comprehensive.
{%- endif %}
{%- if small_file_context %}
- Some files may include a `## Full file context:` section with the complete post-change contents of a small touched file. Use this only to validate suggestions related to the changed file, and do not report issues from untouched regions unless they are necessary to explain a defect introduced by the PR.
{%- endif %}
{%- if findings_metadata %}
- For each suggestion, set `confidence` from `high`, `medium`, or `low` based only on the visible evidence.
- Set `evidence_type` to `diff` when changed lines directly support the suggestion, `ticket` for a clear ticket mismatch, or `inferred` when the suggestion depends on indirect clues.
{%- endif %}


The output must be a YAML object equivalent to type $PRCodeSuggestionsFeedback, according to the following Pydantic definitions:
Expand All @@ -92,6 +100,10 @@ class CodeSuggestionFeedback(BaseModel):
relevant_lines_start: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion starts (inclusive). Should be derived from the added '__new hunk__' line numbers, and correspond to the first line of the relevant 'existing code' snippet.")
relevant_lines_end: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion ends (inclusive). Should be derived from the added '__new hunk__' line numbers, and correspond to the end of the relevant 'existing code' snippet")
suggestion_score: int = Field(description="Evaluate the suggestion and assign a score from 0 to 10. Give 0 if the suggestion is wrong. For valid suggestions, score from 1 (lowest impact/importance) to 10 (highest impact/importance).")
{%- if findings_metadata %}
confidence: str = Field(description="Your confidence in this suggestion based only on the visible evidence. Allowed values: high, medium, low")
evidence_type: str = Field(description="What directly supports this suggestion. Allowed values: diff, ticket, inferred")
{%- endif %}
why: str = Field(description="Briefly explain the score given in 1-2 short sentences, focusing on the suggestion's impact, relevance, and accuracy. When mentioning code elements (variables, names, or files) in your response, surround them with markdown backticks (`).")

class PRCodeSuggestionsFeedback(BaseModel):
Expand All @@ -108,6 +120,12 @@ code_suggestions:
relevant_lines_start: 13
relevant_lines_end: 14
suggestion_score: 6
{%- if findings_metadata %}
confidence: |
medium
evidence_type: |
diff
{%- endif %}
why: |
The variable name 't' is not descriptive enough
- ...
Expand Down Expand Up @@ -141,6 +159,12 @@ code_suggestions:
relevant_lines_start: ...
relevant_lines_end: ...
suggestion_score: ...
{%- if findings_metadata %}
confidence: |
...
evidence_type: |
...
{%- endif %}
why: |
...
- ...
Expand Down
6 changes: 6 additions & 0 deletions pr_agent/settings/configuration.toml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,12 @@ use_conversation_history=true
commitable_code_suggestions = false
dual_publishing_score_threshold=-1 # -1 to disable, [0-10] to set the threshold (>=) for publishing a code suggestion both in a table and as commitable
focus_only_on_problems=true
findings_metadata = false
findings_metadata_badges = false
findings_filter_mode = "off"
small_file_context = false
small_file_context_max_lines = 150
small_file_context_max_tokens = 3500
#
extra_instructions = ""
enable_help_text=false
Expand Down
Loading