diff --git a/pyproject.toml b/pyproject.toml index 74705364b..08acad323 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,8 @@ dependencies = [ "problog >= 2.2.6,<3.0.0", "cryptography >=44.0.0,<45.0.0", "semgrep == 1.113.0", + "pydantic >= 2.11.5,<2.12.0", + "gradio_client == 1.4.3", ] keywords = [] # https://pypi.org/classifiers/ diff --git a/src/macaron/ai/README.md b/src/macaron/ai/README.md new file mode 100644 index 000000000..9fc5b0b30 --- /dev/null +++ b/src/macaron/ai/README.md @@ -0,0 +1,50 @@ +# Macaron AI Module + +This module provides the foundation for interacting with Large Language Models (LLMs) in a provider-agnostic way. It includes an abstract client definition, provider-specific client implementations, a client factory, and utility functions for processing responses. + +## Module Components + +- **clients/base.py** + Defines the abstract [`AIClient`](./clients/base.py) class. This class stores the system prompt and the LLM configuration it is given and serves as the base for all specific AI client implementations. + +- **clients/openai_client.py** + Implements the [`OpenAiClient`](./clients/openai_client.py) class, a concrete subclass of [`AIClient`](./clients/base.py). This client interacts with OpenAI-like APIs by sending requests using HTTP and processing the responses. It also extracts the JSON object from each response using the tools provided. + +- **clients/ai_factory.py** + Contains the [`AIClientFactory`](./clients/ai_factory.py) class, which is responsible for reading provider configuration from the defaults and creating the correct AI client instance. + +- **ai_tools.py** + Offers utility functions such as `extract_json` to assist with parsing the JSON response returned by an LLM. These functions recover the JSON object even when it is embedded in surrounding text, for easier downstream processing. + +## Usage + +1. **Configuration:** + The module reads the LLM configuration from the application defaults (using the `defaults` module). 
Make sure that the `llm` section in your configuration includes valid settings for `enabled`, `provider`, `api_key`, `api_endpoint`, and `model`. + +2. **Creating a Client:** + Use the [`AIClientFactory`](./clients/ai_factory.py) to create an AI client instance. The factory checks the configured provider and returns a client (e.g., an instance of [`OpenAiClient`](./clients/openai_client.py)) that can be used to invoke the LLM, or `None` when the LLM is disabled or the provider is not supported. + + Example: + ```py + from macaron.ai.clients.ai_factory import AIClientFactory + + factory = AIClientFactory() + client = factory.create_client(system_prompt="You are a helpful assistant.") + response = client.invoke("Hello, how can you assist me?") + print(response) + ``` + +3. **Response Processing:** + When a structured response is required, pass a JSON schema to the `invoke` method through its `response_format` argument. The [`ai_tools.py`](./ai_tools.py) module takes care of parsing the response into a JSON object. + +## Logging and Error Handling + +- The module uses Python's logging framework to report important events, such as token usage and errors raised while invoking the LLM. +- Configuration errors (e.g., missing API key or endpoint) are handled by raising descriptive exceptions, such as [`ConfigurationError`](../errors.py). + +## Extensibility + +The design of the AI module is provider-agnostic. To add support for additional LLM providers: +- Implement a new client by subclassing [`AIClient`](./clients/base.py). +- Add the new client to the [`PROVIDER_MAPPING`](./clients/__init__.py). +- Update the configuration defaults accordingly. diff --git a/src/macaron/ai/__init__.py b/src/macaron/ai/__init__.py new file mode 100644 index 000000000..8e17a3508 --- /dev/null +++ b/src/macaron/ai/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. 
# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This module provides utility functions for Large Language Model (LLM)."""
import json
import logging
import re
from typing import Any

logger: logging.Logger = logging.getLogger(__name__)


def extract_json(response_text: str) -> Any:
    """
    Parse the response from the LLM.

    The whole text is parsed as JSON first. If that fails, the text is scanned
    for the first ``{`` from which a complete JSON object can be decoded, so that
    objects wrapped in prose or Markdown fences are still recovered even when
    more braces follow the object.

    Parameters
    ----------
    response_text: str
        The response text from the LLM.

    Returns
    -------
    Any
        The decoded JSON value when the whole text is valid JSON, otherwise the first
        JSON object embedded in the text, or None if no JSON object can be decoded.
        Callers that need specific keys must validate the returned value themselves.
    """
    try:
        return json.loads(response_text)
    except json.JSONDecodeError:
        logger.debug("Full JSON parse failed; trying to extract JSON from text.")

    # A greedy "first brace to last brace" match breaks as soon as the reply contains
    # any brace after the object, so decode incrementally from each opening brace.
    decoder = json.JSONDecoder()
    last_error: json.JSONDecodeError | None = None
    for opening_brace in re.finditer(r"\{", response_text):
        try:
            data, _ = decoder.raw_decode(response_text, opening_brace.start())
        except json.JSONDecodeError as error:
            last_error = error
            continue
        return data

    if last_error is not None:
        logger.debug("Failed to parse extracted JSON: %s", last_error)
    return None
"""This module defines the AIClientFactory class for creating AI clients based on provider configuration."""

import logging

from macaron.ai.clients import PROVIDER_MAPPING
from macaron.ai.clients.base import AIClient
from macaron.config.defaults import defaults
from macaron.errors import ConfigurationError

logger: logging.Logger = logging.getLogger(__name__)


class AIClientFactory:
    """Factory to create AI clients based on provider configuration."""

    def __init__(self) -> None:
        """
        Initialize the AI client factory.

        The LLM configuration is read from defaults.

        Raises
        ------
        ConfigurationError
            If the LLM is enabled but a required setting is empty.
        """
        self.params = self._load_defaults()

    def _load_defaults(self) -> dict | None:
        """
        Read the ``llm`` section of the defaults into a parameter dictionary.

        Returns
        -------
        dict | None
            The ``enabled``, ``provider``, ``api_key``, ``api_endpoint`` and ``model`` settings.
            Built-in fallbacks (disabled, empty strings) are used when the section or a key is absent.

        Raises
        ------
        ConfigurationError
            If the LLM is enabled but one of the settings is empty.
        """
        section_name = "llm"
        default_values = {
            "enabled": False,
            "provider": "",
            "api_key": "",
            "api_endpoint": "",
            "model": "",
        }

        if defaults.has_section(section_name):
            section = defaults[section_name]
            default_values["enabled"] = section.getboolean("enabled", default_values["enabled"])
            for key, default_value in default_values.items():
                if isinstance(default_value, str):
                    value = str(section.get(key, default_value)).strip()
                    # Only the provider is a case-insensitive lookup key into PROVIDER_MAPPING.
                    # API keys, endpoint paths and model names are case-sensitive and must be
                    # passed to the service exactly as configured.
                    default_values[key] = value.lower() if key == "provider" else value

            if default_values["enabled"]:
                for key, value in default_values.items():
                    if not value:
                        raise ConfigurationError(
                            f"AI client configuration '{key}' is required but not set in the defaults."
                        )

        return default_values

    def create_client(self, system_prompt: str) -> AIClient | None:
        """
        Create an AI client based on the configured provider.

        Parameters
        ----------
        system_prompt: str
            The system prompt handed to the created client.

        Returns
        -------
        AIClient | None
            A client for the configured provider, or None if the LLM is disabled
            or the provider is not present in ``PROVIDER_MAPPING``.
        """
        if not self.params or not self.params["enabled"]:
            return None

        client_class = PROVIDER_MAPPING.get(self.params["provider"])
        if client_class is None:
            logger.error("Provider '%s' is not supported.", self.params["provider"])
            return None
        return client_class(system_prompt, self.params)
"""This module provides a client for interacting with a Large Language Model (LLM) that is Openai like."""

import logging
from typing import Any, TypeVar

from pydantic import BaseModel

from macaron.ai.ai_tools import extract_json
from macaron.ai.clients.base import AIClient
from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError
from macaron.util import send_post_http_raw

logger: logging.Logger = logging.getLogger(__name__)

T = TypeVar("T", bound=BaseModel)


class OpenAiClient(AIClient):
    """A client for interacting with a Large Language Model that is OpenAI API like."""

    def invoke(
        self,
        user_prompt: str,
        temperature: float = 0.2,
        response_format: dict | None = None,
        max_tokens: int = 4000,
        timeout: int = 30,
    ) -> Any:
        """
        Invoke the LLM and extract the JSON value from its response.

        Parameters
        ----------
        user_prompt: str
            The user prompt to send to the LLM.
        temperature: float
            The temperature for the LLM response.
        response_format: dict | None
            The ``response_format`` object (e.g. a JSON schema) forwarded to the API.
            It is omitted from the request when None.
        max_tokens: int
            The maximum number of tokens for the LLM response.
        timeout: int
            The timeout for the HTTP request in seconds.

        Returns
        -------
        Any
            The JSON value extracted from the message content. If no JSON can be extracted and
            no `response_format` was requested, the raw message content is returned instead.

        Raises
        ------
        ConfigurationError
            If the client is not enabled in its parameters.
        HeuristicAnalyzerValueError
            If no response is received, the response cannot be processed, or a structured
            response was requested but the content holds no JSON object.
        """
        if not self.params["enabled"]:
            raise ConfigurationError("AI client is not enabled. Please check your configuration.")

        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.params['api_key']}"}
        payload: dict[str, Any] = {
            "model": self.params["model"],
            "messages": [{"role": "system", "content": self.system_prompt}, {"role": "user", "content": user_prompt}],
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        # Leave the field out instead of sending an explicit null, which an
        # OpenAI-compatible endpoint is free to reject.
        if response_format is not None:
            payload["response_format"] = response_format

        try:
            response = send_post_http_raw(
                url=self.params["api_endpoint"], json_data=payload, headers=headers, timeout=timeout
            )
            if not response:
                raise HeuristicAnalyzerValueError("No response received from the LLM.")
            response_json = response.json()
            usage = response_json.get("usage", {})

            if usage:
                usage_str = ", ".join(f"{key} = {value}" for key, value in usage.items())
                logger.info("LLM call token usage: %s", usage_str)

            message_content = response_json["choices"][0]["message"]["content"]
            parsed_content = extract_json(message_content)
        except HeuristicAnalyzerValueError as e:
            # Already the documented error type: report it without wrapping it a second time.
            logger.error("Error during LLM invocation: %s", e)
            raise
        except Exception as e:
            logger.error("Error during LLM invocation: %s", e)
            raise HeuristicAnalyzerValueError(f"Failed to get or validate LLM response: {e}") from e

        if parsed_content is not None:
            return parsed_content
        if response_format is None:
            # Free-form request: hand back the raw text rather than None.
            return message_content

        # Callers that asked for structured output index into the result, so fail
        # with the documented error instead of returning None.
        error_msg = "The LLM response does not contain a valid JSON object."
        logger.error("Error during LLM invocation: %s", error_msg)
        raise HeuristicAnalyzerValueError(error_msg)
-0,0 +1,2 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. diff --git a/src/macaron/ai/schemas/__init__.py b/src/macaron/ai/schemas/__init__.py new file mode 100644 index 000000000..8e17a3508 --- /dev/null +++ b/src/macaron/ai/schemas/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index 0c31aaca7..5f7f031f2 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -635,3 +635,18 @@ custom_semgrep_rules_path = # .yaml prefix. Note, this will be ignored if a path to custom semgrep rules is not provided. This list may not contain # duplicated elements, meaning that ruleset names must be unique. disabled_custom_rulesets = + +[llm] +# The LLM configuration for Macaron. +# If enabled, the LLM will be used to analyze the results and provide insights. +enabled = False +# The provider for the LLM service. +# Supported providers : +# - openai: OpenAI's GPT models. +provider = +# The API key for the LLM service. +api_key = +# The API endpoint for the LLM service. +api_endpoint = +# The model to use for the LLM service. +model = diff --git a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py index eebce5764..db5e61515 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py @@ -43,6 +43,12 @@ class Heuristics(str, Enum): #: Indicates that the package source code contains suspicious code patterns. 
"""This analyzer checks if a PyPI package has an inconsistent description."""

import logging

from macaron.ai.clients.ai_factory import AIClientFactory
from macaron.errors import HeuristicAnalyzerValueError
from macaron.json_tools import JsonType, json_extract
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset

logger: logging.Logger = logging.getLogger(__name__)


class InconsistentDescriptionAnalyzer(BaseHeuristicAnalyzer):
    """Check whether the package's description is complete and self-consistent, as scored by an LLM."""

    SYSTEM_PROMPT = """
    You are a security expert analyzing a PyPI package. Determine if the package description is secure.
    you must score between 0 and 100 based on the following criteria:
    - High-level description summary
    - Benefit
    - How to install
    - How to use
    - Are the "How to use", "High-level description summary", and "Benefit" sections consistent?

    Wrap the output in `json` tags.
    Your response must be a JSON object matching this schema:
    {
        "score": 0-100,
        "reason": "A short explanation.",
    }
    """

    # Scores strictly below this value make the heuristic fail.
    THRESHOLD = 60

    RESPONSE_FORMAT = {
        "type": "json_schema",
        "json_schema": {
            "name": "result_schema",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "score": {
                        "type": "integer",
                        "minimum": 0,
                        "maximum": 100,
                        "description": "The final score from 0 to 100 after analysing the packages' description.",
                    },
                    "reason": {
                        "type": "string",
                        "description": "The reason for the overall score. It should be a short sentence explaining the decision.",
                    },
                },
                "required": ["score", "reason"],
                # Strict structured output requires objects to be closed to extra keys.
                "additionalProperties": False,
            },
        },
    }

    def __init__(self) -> None:
        super().__init__(
            name="inconsistent_description_analyzer", heuristic=Heuristics.INCONSISTENT_DESCRIPTION, depends_on=None
        )
        factory = AIClientFactory()
        # None when the factory does not create a client; analyze() then skips the heuristic.
        self.client = factory.create_client(self.SYSTEM_PROMPT.strip())

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Analyze the package.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            The result and related information collected during the analysis.

        Raises
        ------
        HeuristicAnalyzerValueError
            If the package metadata has no ``info`` entry, or the LLM result is not an object
            with a numeric ``score``.
        """
        if not self.client:
            return HeuristicResult.SKIP, {}

        package_json = pypi_package_json.package_json
        info = package_json.get("info", {})
        if not info:
            error_msg = "No package info found in metadata"
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg)

        description = json_extract(package_json, ["info", "description"], str)
        if not description or not description.strip():
            return HeuristicResult.FAIL, {"message": "No description found."}

        analysis_result = self.client.invoke(
            user_prompt=description,
            response_format=self.RESPONSE_FORMAT,
        )

        # The LLM output is untrusted: check its shape before indexing into it so that a
        # malformed answer surfaces as the analyzer's own error type.
        score = analysis_result.get("score") if isinstance(analysis_result, dict) else None
        if isinstance(score, bool) or not isinstance(score, (int, float)):
            error_msg = "The LLM result does not contain a numeric score."
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg)

        if score < self.THRESHOLD:
            reason = analysis_result.get("reason", "not provided")
            return HeuristicResult.FAIL, {"message": f"inconsistent description with score {score}. because {reason}"}
        return HeuristicResult.PASS, {"message": f"consistent description with a {score} score."}
"""This analyzer checks the inconsistency of code with its docstrings."""

import logging
import time

from macaron.ai.clients.ai_factory import AIClientFactory
from macaron.errors import HeuristicAnalyzerValueError
from macaron.json_tools import JsonType
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset

logger: logging.Logger = logging.getLogger(__name__)


class MatchingDocstringsAnalyzer(BaseHeuristicAnalyzer):
    """Check whether the docstrings and the code components are consistent."""

    SYSTEM_PROMPT = """
    You are a code master who can detect the inconsistency of the code with the docstrings that describes its components.
    You will be given a python code file. Your task is to determine whether the code is consistent with the docstrings.
    Wrap the output in `json` tags.
    Your response must be a JSON object matching this schema:
    {
        "decision": "'consistent' or 'inconsistent'",
        "reason": "A short explanation.", "inconsistent_code_part":
        "The inconsistent code, or null."
    }

    /no_think
    """

    # Seconds to wait before each LLM request.
    REQUEST_INTERVAL = 0.5

    RESPONSE_FORMAT = {
        "type": "json_schema",
        "json_schema": {
            "name": "result_schema",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "decision": {
                        "type": "string",
                        "enum": ["consistent", "inconsistent"],
                        "description": """The final decision after analysing the code with its docstrings.
                        It can be either 'consistent' or 'inconsistent'.""",
                    },
                    "reason": {
                        "type": "string",
                        "description": "The reason for the decision made.",
                    },
                    "inconsistent_code_part": {
                        "type": ["string", "null"],
                        "description": """The specific part of the code that is inconsistent with the docstring.
                        Empty if the decision is 'consistent'.""",
                    },
                },
                # "required" is a keyword of the object schema itself; placed next to "schema"
                # it is not part of the schema and no field is marked as required.
                "required": ["decision", "reason", "inconsistent_code_part"],
                # Strict structured output requires objects to be closed to extra keys.
                "additionalProperties": False,
            },
        },
    }

    def __init__(self) -> None:
        super().__init__(
            name="matching_docstrings_analyzer",
            heuristic=Heuristics.MATCHING_DOCSTRINGS,
            depends_on=None,
        )
        factory = AIClientFactory()
        # None when the factory does not create a client; analyze() then skips the heuristic.
        self.client = factory.create_client(self.SYSTEM_PROMPT.strip())

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Analyze the package.

        Every ``.py`` file of the downloaded source code is sent to the LLM in turn; the first
        file reported as inconsistent fails the heuristic.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            The result and related information collected during the analysis.

        Raises
        ------
        HeuristicAnalyzerValueError
            If the LLM result for a file is not a JSON object.
        """
        if not self.client:
            return HeuristicResult.SKIP, {}

        download_result = pypi_package_json.download_sourcecode()
        if not download_result:
            logger.warning("No source code found for the package, skipping the matching docstrings analysis.")
            return HeuristicResult.SKIP, {}

        for file, content in pypi_package_json.iter_sourcecode():
            if not file.endswith(".py"):
                continue

            time.sleep(self.REQUEST_INTERVAL)  # Respect the request interval to avoid rate limiting.
            code_str = content.decode("utf-8", "ignore")
            analysis_result = self.client.invoke(
                user_prompt=code_str,
                response_format=self.RESPONSE_FORMAT,
            )

            # The LLM output is untrusted: check its shape before indexing into it so that a
            # malformed answer surfaces as the analyzer's own error type.
            if not isinstance(analysis_result, dict):
                error_msg = f"The LLM result for {file} is not a JSON object."
                logger.debug(error_msg)
                raise HeuristicAnalyzerValueError(error_msg)

            if analysis_result.get("decision") == "inconsistent":
                return HeuristicResult.FAIL, {
                    "file": file,
                    "reason": analysis_result.get("reason", ""),
                    "inconsistent part": analysis_result.get("inconsistent_code_part") or "",
                }
        return HeuristicResult.PASS, {}
+# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the Gradle class which inherits BaseBuildTool. @@ -122,7 +122,7 @@ def get_dep_analyzer(self) -> CycloneDxGradle: raise DependencyAnalyzerError("No default dependency analyzer is found.") if not DependencyAnalyzer.tool_valid(defaults.get("dependency.resolver", "dep_tool_gradle")): raise DependencyAnalyzerError( - f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_gradle')} is not valid.", + f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.", ) tool_name, tool_version = tuple( diff --git a/src/macaron/slsa_analyzer/build_tool/maven.py b/src/macaron/slsa_analyzer/build_tool/maven.py index 69323ad9c..e6c11c13e 100644 --- a/src/macaron/slsa_analyzer/build_tool/maven.py +++ b/src/macaron/slsa_analyzer/build_tool/maven.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the Maven class which inherits BaseBuildTool. 
@@ -116,7 +116,7 @@ def get_dep_analyzer(self) -> CycloneDxMaven: raise DependencyAnalyzerError("No default dependency analyzer is found.") if not DependencyAnalyzer.tool_valid(defaults.get("dependency.resolver", "dep_tool_maven")): raise DependencyAnalyzerError( - f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_maven')} is not valid.", + f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_maven')} is not valid.", ) tool_name, tool_version = tuple( diff --git a/src/macaron/slsa_analyzer/build_tool/pip.py b/src/macaron/slsa_analyzer/build_tool/pip.py index 5abf0c0ba..c0e970ab9 100644 --- a/src/macaron/slsa_analyzer/build_tool/pip.py +++ b/src/macaron/slsa_analyzer/build_tool/pip.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the Pip class which inherits BaseBuildTool. @@ -88,7 +88,7 @@ def get_dep_analyzer(self) -> DependencyAnalyzer: tool_name = "cyclonedx_py" if not DependencyAnalyzer.tool_valid(f"{tool_name}:{cyclonedx_version}"): raise DependencyAnalyzerError( - f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_gradle')} is not valid.", + f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.", ) return CycloneDxPython( resources_path=global_config.resources_path, diff --git a/src/macaron/slsa_analyzer/build_tool/poetry.py b/src/macaron/slsa_analyzer/build_tool/poetry.py index eeb54216b..54e3899f1 100644 --- a/src/macaron/slsa_analyzer/build_tool/poetry.py +++ b/src/macaron/slsa_analyzer/build_tool/poetry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the Poetry class which inherits BaseBuildTool. @@ -126,7 +126,7 @@ def get_dep_analyzer(self) -> DependencyAnalyzer: tool_name = "cyclonedx_py" if not DependencyAnalyzer.tool_valid(f"{tool_name}:{cyclonedx_version}"): raise DependencyAnalyzerError( - f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_gradle')} is not valid.", + f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.", ) return CycloneDxPython( resources_path=global_config.resources_path, diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 8514a458d..7c28c365e 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -21,11 +21,13 @@ from macaron.malware_analyzer.pypi_heuristics.metadata.closer_release_join_date import CloserReleaseJoinDateAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.empty_project_link import EmptyProjectLinkAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import HighReleaseFrequencyAnalyzer +from macaron.malware_analyzer.pypi_heuristics.metadata.inconsistent_description import InconsistentDescriptionAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.one_release import OneReleaseAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo import SourceCodeRepoAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.typosquatting_presence import TyposquattingPresenceAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release import UnchangedReleaseAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer +from 
macaron.malware_analyzer.pypi_heuristics.sourcecode.matching_docstrings import MatchingDocstringsAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer from macaron.slsa_analyzer.analyze_context import AnalyzeContext @@ -358,6 +360,9 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: WheelAbsenceAnalyzer, AnomalousVersionAnalyzer, TyposquattingPresenceAnalyzer, + MatchingDocstringsAnalyzer, + # This heuristic is not used in any combination below, some tests needed before doing that. + InconsistentDescriptionAnalyzer, ] # name used to query the result of all problog rules, so it can be accessed outside the model. @@ -425,6 +430,10 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: failed({Heuristics.ONE_RELEASE.value}), failed({Heuristics.ANOMALOUS_VERSION.value}). + % Package released with a name similar to a popular package. + {Confidence.MEDIUM.value}::trigger(malware_medium_confidence_3) :- + quickUndetailed, forceSetup, failed({Heuristics.MATCHING_DOCSTRINGS.value}). + % ----- Evaluation ----- % Aggregate result @@ -432,6 +441,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: {problog_result_access} :- trigger(malware_high_confidence_2). {problog_result_access} :- trigger(malware_high_confidence_3). {problog_result_access} :- trigger(malware_high_confidence_4). + {problog_result_access} :- trigger(malware_medium_confidence_3). {problog_result_access} :- trigger(malware_medium_confidence_2). {problog_result_access} :- trigger(malware_medium_confidence_1). query({problog_result_access}). 
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""Tests for the InconsistentDescriptionAnalyzer heuristic."""

from unittest.mock import MagicMock, patch

import pytest

from macaron.errors import HeuristicAnalyzerValueError
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
from macaron.malware_analyzer.pypi_heuristics.metadata.inconsistent_description import InconsistentDescriptionAnalyzer


@pytest.fixture(name="analyzer")
def analyzer_() -> InconsistentDescriptionAnalyzer:
    """Pytest fixture to create an InconsistentDescriptionAnalyzer instance backed by a mock AI client.

    The LLM is disabled in the shipped defaults, so the analyzer would otherwise be created
    without a client and every test would have to be skipped. Injecting a mock keeps the tests
    running regardless of the local LLM configuration and never contacts a real service.
    """
    analyzer = InconsistentDescriptionAnalyzer()
    analyzer.client = MagicMock()
    return analyzer


def test_analyze_client_disabled_skip(analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock) -> None:
    """Test the analyzer is skipped when no AI client is available."""
    analyzer.client = None
    result, info = analyzer.analyze(pypi_package_json)
    assert result == HeuristicResult.SKIP
    assert not info


def test_analyze_consistent_description_pass(
    analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock
) -> None:
    """Test the analyzer passes when the description is consistent."""
    pypi_package_json.package_json = {"info": {"description": "This is a test package."}}
    mock_result = {"score": 80, "reason": "The description is consistent."}

    with patch.object(analyzer.client, "invoke", return_value=mock_result) as mock_invoke:
        result, info = analyzer.analyze(pypi_package_json)
        assert result == HeuristicResult.PASS
        assert isinstance(info["message"], str)
        assert "consistent description with a 80 score" in info["message"]
        mock_invoke.assert_called_once()


def test_analyze_inconsistent_description_fail(
    analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock
) -> None:
    """Test the analyzer fails when the description is inconsistent."""
    pypi_package_json.package_json = {"info": {"description": "This is a misleading package."}}
    mock_result = {"score": 30, "reason": "The description is misleading."}

    with patch.object(analyzer.client, "invoke", return_value=mock_result) as mock_invoke:
        result, info = analyzer.analyze(pypi_package_json)
        assert result == HeuristicResult.FAIL
        assert isinstance(info["message"], str)
        assert "inconsistent description with score 30" in info["message"]
        assert "because The description is misleading" in info["message"]
        mock_invoke.assert_called_once()


def test_analyze_no_description_fail(analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock) -> None:
    """Test the analyzer fails if there is no description."""
    pypi_package_json.package_json = {"info": {"description": " "}}
    with patch.object(analyzer.client, "invoke") as mock_invoke:
        result, info = analyzer.analyze(pypi_package_json)
        assert result == HeuristicResult.FAIL
        assert info["message"] == "No description found."
        mock_invoke.assert_not_called()


def test_analyze_no_info_raises_error(analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock) -> None:
    """Test the analyzer raises an error if the package JSON has no 'info' field."""
    pypi_package_json.package_json = {}
    with pytest.raises(HeuristicAnalyzerValueError):
        analyzer.analyze(pypi_package_json)
+ """ + if not analyzer.client: + pytest.skip("AI client disabled - skipping test") + + +def test_analyze_consistent_docstrings_pass(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer passes when docstrings are consistent with the code.""" + pypi_package_json.download_sourcecode.return_value = True + pypi_package_json.iter_sourcecode.return_value = [("test.py", b"def func():\n '''docstring'''\n pass")] + + mock_result = { + "decision": "consistent", + "reason": "The code is consistent with the docstring.", + "inconsistent_code_part": None, + } + + with patch.object(analyzer.client, "invoke", return_value=mock_result) as mock_invoke: + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert not info + mock_invoke.assert_called_once() + + +def test_analyze_inconsistent_docstrings_fail( + analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock +) -> None: + """Test the analyzer fails when docstrings are inconsistent with the code.""" + pypi_package_json.download_sourcecode.return_value = True + pypi_package_json.iter_sourcecode.return_value = [ + ("test.py", b"def func():\n '''docstring'''\n print('hello')") + ] + + mock_result = { + "decision": "inconsistent", + "reason": "The docstring does not mention the print statement.", + "inconsistent_code_part": "print('hello')", + } + + with patch.object(analyzer.client, "invoke", return_value=mock_result): + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.FAIL + assert info["file"] == "test.py" + assert info["reason"] == "The docstring does not mention the print statement." 
+ assert info["inconsistent part"] == "print('hello')" + + +def test_analyze_no_source_code_skip(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer skips if the source code cannot be downloaded.""" + pypi_package_json.download_sourcecode.return_value = False + with patch.object(analyzer.client, "invoke") as mock_invoke: + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.SKIP + assert not info + mock_invoke.assert_not_called() + + +def test_analyze_no_python_files_pass(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer passes if there are no Python files in the source code.""" + pypi_package_json.download_sourcecode.return_value = True + pypi_package_json.iter_sourcecode.return_value = [("README.md", b"This is a test package.")] + with patch.object(analyzer.client, "invoke") as mock_invoke: + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert not info + mock_invoke.assert_not_called() + + +def test_analyze_llm_invocation_error_pass(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer passes if the LLM invocation returns None (e.g., on API error).""" + pypi_package_json.download_sourcecode.return_value = True + pypi_package_json.iter_sourcecode.return_value = [("test.py", b"def func():\n pass")] + + with patch.object(analyzer.client, "invoke", return_value=None): + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert not info