Skip to content

Commit ca1fc9c

Browse files
authored
feat(heuristics): add two analyzers to detect dependency confusion and distinguish from stub packages (#1117)
This PR adds two new heuristic analyzers designed to detect potential dependency confusion attacks and differentiate them from harmless stub or placeholder packages. Signed-off-by: Amine <amine.raouane@enim.ac.ma>
1 parent 8fe34e5 commit ca1fc9c

File tree

7 files changed

+316
-1
lines changed

7 files changed

+316
-1
lines changed

src/macaron/malware_analyzer/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,17 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b
7070
- **Rule**: Return `HeuristicResult.FAIL` if the email is invalid; otherwise, return `HeuristicResult.PASS`.
7171
- **Dependency**: None.
7272
73+
74+
12. **Minimal Content**
75+
- **Description**: Checks if the package has a small number of files.
76+
- **Rule**: Return `HeuristicResult.FAIL` if the number of files is strictly less than FILES_THRESHOLD; otherwise, return `HeuristicResult.PASS`.
77+
- **Dependency**: None.
78+
79+
13. **Unsecure Description**
80+
- **Description**: Checks if the package description is unsecure, such as not having descriptive keywords that indicate it is a stub package.
81+
- **Rule**: Return `HeuristicResult.FAIL` if no descriptive keyword is found in the package description or summary; otherwise, return `HeuristicResult.PASS`.
82+
- **Dependency**: None.
83+
7384
### Source Code Analysis with Semgrep
7485
**PyPI Source Code Analyzer**
7586
- **Description**: Uses Semgrep, with default rules written in `src/macaron/resources/pypi_malware_rules` and custom rules available by supplying a path to `custom_semgrep_rules` in `defaults.ini`, to scan the package `.tar` source code.

src/macaron/malware_analyzer/pypi_heuristics/heuristics.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ class Heuristics(str, Enum):
4949
#: Indicates that the package has a similar structure to other packages maintained by the same user.
5050
SIMILAR_PROJECTS = "similar_projects"
5151

52+
#: Indicates that the package has minimal content.
53+
MINIMAL_CONTENT = "minimal_content"
54+
55+
#: Indicates that the package's description is unsecure, such as not having descriptive keywords.
56+
UNSECURE_DESCRIPTION = "unsecure_description"
57+
5258

5359
class HeuristicResult(str, Enum):
5460
"""Result type indicating the outcome of a heuristic."""
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""This analyzer checks if a PyPI package has minimal content."""
5+
6+
import logging
7+
import os
8+
9+
from macaron.errors import SourceCodeError
10+
from macaron.json_tools import JsonType
11+
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
12+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
13+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
14+
15+
logger: logging.Logger = logging.getLogger(__name__)
16+
17+
18+
class MinimalContentAnalyzer(BaseHeuristicAnalyzer):
    """Heuristic that reports whether a package's downloaded source code contains few files."""

    # Minimum number of files the source tree must contain for the heuristic to pass.
    FILES_THRESHOLD = 50

    def __init__(self) -> None:
        super().__init__(name="minimal_content_analyzer", heuristic=Heuristics.MINIMAL_CONTENT, depends_on=None)

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Count the files in the package source code and compare against the threshold.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            ``HeuristicResult.FAIL`` when fewer than ``FILES_THRESHOLD`` files are found,
            ``HeuristicResult.PASS`` otherwise, together with an explanatory message.

        Raises
        ------
        SourceCodeError
            If downloading the source code reports failure.
        """
        if not pypi_package_json.download_sourcecode():
            error_msg = "No source code files have been downloaded"
            logger.debug(error_msg)
            raise SourceCodeError(error_msg)

        # os.walk visits every directory below the source path, so this is a recursive file count.
        total_files = 0
        for _root, _dirs, filenames in os.walk(pypi_package_json.package_sourcecode_path):
            total_files += len(filenames)

        if total_files < self.FILES_THRESHOLD:
            return HeuristicResult.FAIL, {"message": "Not enough files found"}

        return HeuristicResult.PASS, {"message": "Package has sufficient content"}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""This analyzer checks if a PyPI package has unsecure description."""
5+
6+
import logging
7+
import re
8+
9+
from macaron.errors import HeuristicAnalyzerValueError
10+
from macaron.json_tools import JsonType, json_extract
11+
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
12+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
13+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
14+
15+
logger: logging.Logger = logging.getLogger(__name__)
16+
17+
18+
class UnsecureDescriptionAnalyzer(BaseHeuristicAnalyzer):
    """Check whether the package's description is unsecure.

    The description is considered secure when the description or the summary contains
    at least one keyword matched by ``SECURE_DESCRIPTION_REGEX``.
    """

    # Whole-word, case-insensitive keywords that mark a package as a declared stub/placeholder.
    SECURE_DESCRIPTION_REGEX = re.compile(
        r"\b(?:internal|private|stub|placeholder|dependency confusion|security|namespace protection|reserved|harmless|prevent)\b",
        re.IGNORECASE,
    )

    def __init__(self) -> None:
        super().__init__(
            name="unsecure_description_analyzer", heuristic=Heuristics.UNSECURE_DESCRIPTION, depends_on=None
        )

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Analyze the package.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            The result and related information collected during the analysis.

        Raises
        ------
        HeuristicAnalyzerValueError
            If the package metadata has no (or an empty) ``info`` section.
        """
        package_json = pypi_package_json.package_json
        info = package_json.get("info", {})
        if not info:
            error_msg = "No package info found in metadata"
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg)

        description = json_extract(package_json, ["info", "description"], str)
        summary = json_extract(package_json, ["info", "summary"], str)
        # Search only the text that is actually present: formatting a missing field into the
        # string would put its placeholder (presumably "None" -- confirm against json_extract)
        # into the text being matched.
        data = " ".join(text for text in (description, summary) if text)
        if self.SECURE_DESCRIPTION_REGEX.search(data):
            return HeuristicResult.PASS, {"message": "Package description is secure"}
        return HeuristicResult.FAIL, {"message": "Package description is unsecure"}

src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@
2222
from macaron.malware_analyzer.pypi_heuristics.metadata.empty_project_link import EmptyProjectLinkAnalyzer
2323
from macaron.malware_analyzer.pypi_heuristics.metadata.fake_email import FakeEmailAnalyzer
2424
from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import HighReleaseFrequencyAnalyzer
25+
from macaron.malware_analyzer.pypi_heuristics.metadata.minimal_content import MinimalContentAnalyzer
2526
from macaron.malware_analyzer.pypi_heuristics.metadata.one_release import OneReleaseAnalyzer
2627
from macaron.malware_analyzer.pypi_heuristics.metadata.similar_projects import SimilarProjectAnalyzer
2728
from macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo import SourceCodeRepoAnalyzer
2829
from macaron.malware_analyzer.pypi_heuristics.metadata.typosquatting_presence import TyposquattingPresenceAnalyzer
2930
from macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release import UnchangedReleaseAnalyzer
31+
from macaron.malware_analyzer.pypi_heuristics.metadata.unsecure_description import UnsecureDescriptionAnalyzer
3032
from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer
3133
from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer
3234
from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer
@@ -366,6 +368,8 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
366368
TyposquattingPresenceAnalyzer,
367369
FakeEmailAnalyzer,
368370
SimilarProjectAnalyzer,
371+
UnsecureDescriptionAnalyzer,
372+
MinimalContentAnalyzer,
369373
]
370374

371375
# name used to query the result of all problog rules, so it can be accessed outside the model.
@@ -419,6 +423,13 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
419423
{Confidence.HIGH.value}::trigger(malware_high_confidence_4) :-
420424
quickUndetailed, forceSetup, failed({Heuristics.TYPOSQUATTING_PRESENCE.value}).
421425
426+
% Package released with dependency confusion .
427+
{Confidence.HIGH.value}::trigger(malware_high_confidence_5) :-
428+
forceSetup,
429+
passed({Heuristics.MINIMAL_CONTENT.value}),
430+
failed({Heuristics.ANOMALOUS_VERSION.value}),
431+
failed({Heuristics.UNSECURE_DESCRIPTION.value}).
432+
422433
% Package released recently with little detail, with multiple releases as a trust marker, but frequent and with
423434
% the same code.
424435
{Confidence.MEDIUM.value}::trigger(malware_medium_confidence_1) :-
@@ -431,7 +442,8 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
431442
{Confidence.MEDIUM.value}::trigger(malware_medium_confidence_2) :-
432443
quickUndetailed,
433444
failed({Heuristics.ONE_RELEASE.value}),
434-
failed({Heuristics.ANOMALOUS_VERSION.value}).
445+
failed({Heuristics.ANOMALOUS_VERSION.value}),
446+
failed({Heuristics.UNSECURE_DESCRIPTION.value}).
435447
436448
% Package has no links, one release or multiple quick releases, and a suspicious maintainer who recently
437449
% joined, has a fake email address, and other similarly-structured projects.
@@ -445,13 +457,15 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
445457
failed({Heuristics.SIMILAR_PROJECTS.value}),
446458
failed({Heuristics.HIGH_RELEASE_FREQUENCY.value}),
447459
failed({Heuristics.FAKE_EMAIL.value}).
460+
448461
% ----- Evaluation -----
449462
450463
% Aggregate result
451464
{problog_result_access} :- trigger(malware_high_confidence_1).
452465
{problog_result_access} :- trigger(malware_high_confidence_2).
453466
{problog_result_access} :- trigger(malware_high_confidence_3).
454467
{problog_result_access} :- trigger(malware_high_confidence_4).
468+
{problog_result_access} :- trigger(malware_high_confidence_5).
455469
{problog_result_access} :- trigger(malware_medium_confidence_1).
456470
{problog_result_access} :- trigger(malware_medium_confidence_2).
457471
{problog_result_access} :- trigger(malware_medium_confidence_3).
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""Tests for the MinimalContentAnalyzer heuristic."""
5+
6+
from unittest.mock import MagicMock, patch
7+
8+
import pytest
9+
10+
from macaron.errors import SourceCodeError
11+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
12+
from macaron.malware_analyzer.pypi_heuristics.metadata.minimal_content import MinimalContentAnalyzer
13+
14+
15+
@pytest.fixture(name="analyzer")
def analyzer_() -> MinimalContentAnalyzer:
    """Provide the MinimalContentAnalyzer instance under test."""
    minimal_content_analyzer = MinimalContentAnalyzer()
    return minimal_content_analyzer
19+
20+
21+
def test_analyze_sufficient_files_pass(analyzer: MinimalContentAnalyzer, pypi_package_json: MagicMock) -> None:
    """A source tree holding 60 files is above the threshold, so the heuristic passes."""
    pypi_package_json.download_sourcecode.return_value = True
    pypi_package_json.package_sourcecode_path = "/fake/path"
    sixty_files = [f"file{i}.py" for i in range(60)]

    # Replace os.walk so no real directory is needed.
    with patch("os.walk", return_value=[("root", [], sixty_files)]):
        outcome = analyzer.analyze(pypi_package_json)

    assert outcome == (HeuristicResult.PASS, {"message": "Package has sufficient content"})
    pypi_package_json.download_sourcecode.assert_called_once()
32+
33+
34+
def test_analyze_exactly_threshold_files_pass(analyzer: MinimalContentAnalyzer, pypi_package_json: MagicMock) -> None:
    """A source tree holding exactly 50 files (the threshold) still passes."""
    pypi_package_json.download_sourcecode.return_value = True
    pypi_package_json.package_sourcecode_path = "/fake/path"
    fifty_files = [f"file{i}.py" for i in range(50)]

    with patch("os.walk", return_value=[("root", [], fifty_files)]):
        outcome = analyzer.analyze(pypi_package_json)

    assert outcome == (HeuristicResult.PASS, {"message": "Package has sufficient content"})
44+
45+
46+
def test_analyze_insufficient_files_fail(analyzer: MinimalContentAnalyzer, pypi_package_json: MagicMock) -> None:
    """A source tree holding a single file is below the threshold, so the heuristic fails."""
    pypi_package_json.download_sourcecode.return_value = True
    pypi_package_json.package_sourcecode_path = "/fake/path"

    with patch("os.walk", return_value=[("root", [], ["file1.py"])]):
        outcome = analyzer.analyze(pypi_package_json)

    assert outcome == (HeuristicResult.FAIL, {"message": "Not enough files found"})
56+
57+
58+
def test_analyze_no_files_fail(analyzer: MinimalContentAnalyzer, pypi_package_json: MagicMock) -> None:
    """An empty source tree fails the heuristic."""
    pypi_package_json.download_sourcecode.return_value = True
    pypi_package_json.package_sourcecode_path = "/fake/path"

    with patch("os.walk", return_value=[("root", [], [])]):
        outcome = analyzer.analyze(pypi_package_json)

    assert outcome == (HeuristicResult.FAIL, {"message": "Not enough files found"})
68+
69+
70+
def test_analyze_download_failed_raises_error(analyzer: MinimalContentAnalyzer, pypi_package_json: MagicMock) -> None:
    """A failed source code download surfaces as a SourceCodeError."""
    pypi_package_json.download_sourcecode.return_value = False

    # The message contains no regex metacharacters, so ``match`` is a plain substring check here.
    with pytest.raises(SourceCodeError, match="No source code files have been downloaded"):
        analyzer.analyze(pypi_package_json)

    pypi_package_json.download_sourcecode.assert_called_once()
79+
80+
81+
@pytest.mark.parametrize(
    ("file_count", "expected_result"),
    [
        (0, HeuristicResult.FAIL),
        (1, HeuristicResult.FAIL),
        (2, HeuristicResult.FAIL),
        # Boundary: one below the threshold must fail, the threshold itself must pass.
        (MinimalContentAnalyzer.FILES_THRESHOLD - 1, HeuristicResult.FAIL),
        (MinimalContentAnalyzer.FILES_THRESHOLD, HeuristicResult.PASS),
        (55, HeuristicResult.PASS),
        (70, HeuristicResult.PASS),
    ],
)
def test_analyze_various_file_counts(
    analyzer: MinimalContentAnalyzer,
    pypi_package_json: MagicMock,
    file_count: int,
    expected_result: HeuristicResult,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Test the analyzer with various file counts, including both sides of the threshold."""
    pypi_package_json.download_sourcecode.return_value = True
    pypi_package_json.package_sourcecode_path = "/fake/path"
    files = [f"file{i}.py" for i in range(file_count)]
    # Replace os.walk so the analyzer sees a single directory holding ``file_count`` files.
    mock_walk = MagicMock(return_value=[("root", [], files)])
    monkeypatch.setattr("os.walk", mock_walk)

    result, _ = analyzer.analyze(pypi_package_json)

    assert result == expected_result
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""Tests for the UnsecureDescriptionAnalyzer heuristic."""
5+
6+
from unittest.mock import MagicMock
7+
8+
import pytest
9+
10+
from macaron.errors import HeuristicAnalyzerValueError
11+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
12+
from macaron.malware_analyzer.pypi_heuristics.metadata.unsecure_description import UnsecureDescriptionAnalyzer
13+
14+
15+
@pytest.fixture(name="analyzer")
def analyzer_() -> UnsecureDescriptionAnalyzer:
    """Provide the UnsecureDescriptionAnalyzer instance under test."""
    unsecure_description_analyzer = UnsecureDescriptionAnalyzer()
    return unsecure_description_analyzer
19+
20+
21+
def test_analyze_secure_description_pass(analyzer: UnsecureDescriptionAnalyzer, pypi_package_json: MagicMock) -> None:
    """A description containing the keyword "internal" is reported as secure."""
    metadata = {"description": "This is an internal package."}
    pypi_package_json.package_json = {"info": metadata}

    outcome = analyzer.analyze(pypi_package_json)

    assert outcome[0] == HeuristicResult.PASS
    assert outcome[1]["message"] == "Package description is secure"
27+
28+
29+
def test_analyze_unsecure_description_fail(analyzer: UnsecureDescriptionAnalyzer, pypi_package_json: MagicMock) -> None:
    """A description without any of the expected keywords is reported as unsecure."""
    metadata = {"description": "A public utility library."}
    pypi_package_json.package_json = {"info": metadata}

    outcome = analyzer.analyze(pypi_package_json)

    assert outcome[0] == HeuristicResult.FAIL
    assert outcome[1]["message"] == "Package description is unsecure"
35+
36+
37+
def test_analyze_no_info_skip(analyzer: UnsecureDescriptionAnalyzer, pypi_package_json: MagicMock) -> None:
    """Metadata without an ``info`` section makes the analyzer raise HeuristicAnalyzerValueError."""
    pypi_package_json.package_json = {}

    # The message contains no regex metacharacters, so ``match`` is a plain substring check here.
    with pytest.raises(HeuristicAnalyzerValueError, match="No package info found in metadata"):
        analyzer.analyze(pypi_package_json)
43+
44+
45+
@pytest.mark.parametrize(
    ("metadata", "expected_result"),
    [
        ({"description": "For internal use only"}, HeuristicResult.PASS),
        ({"summary": "This is a private package"}, HeuristicResult.PASS),
        ({"description": "A placeholder for a future project"}, HeuristicResult.PASS),
        ({"summary": "Used for dependency confusion testing"}, HeuristicResult.PASS),
        ({"description": "A package for security research"}, HeuristicResult.PASS),
        ({"summary": "This name is reserved for namespace protection"}, HeuristicResult.PASS),
        ({"description": "This is a stub package"}, HeuristicResult.PASS),
        # Keywords of the pattern that were previously not exercised on their own.
        ({"summary": "This name is reserved"}, HeuristicResult.PASS),
        ({"description": "A harmless package"}, HeuristicResult.PASS),
        ({"summary": "Published to prevent name squatting"}, HeuristicResult.PASS),
        # The pattern is compiled with re.IGNORECASE, so letter case must not matter.
        ({"description": "INTERNAL USE ONLY"}, HeuristicResult.PASS),
        ({"description": "A regular package", "summary": "Does regular things"}, HeuristicResult.FAIL),
    ],
)
def test_analyze_scenarios(
    analyzer: UnsecureDescriptionAnalyzer,
    pypi_package_json: MagicMock,
    metadata: dict[str, str],
    expected_result: HeuristicResult,
) -> None:
    """Test the analyzer with various metadata scenarios.

    Each case supplies the ``info`` section of the package JSON; a case passes when the
    description or summary contains one of the analyzer's keywords.
    """
    pypi_package_json.package_json = {"info": metadata}
    result, _ = analyzer.analyze(pypi_package_json)
    assert result == expected_result

0 commit comments

Comments
 (0)