Skip to content

Commit c323ddd

Browse files
authored
feat(heuristics): add Fake Email analyzer to validate maintainer email domain (#1106)
This PR adds a new heuristic analyzer called FakeEmailAnalyzer. It verifies the author and maintainer email addresses listed in a PyPI package's metadata by checking both their format and, when deliverability checking is enabled, the existence of DNS records for their domains. This helps detect packages with fake or throwaway emails, which are often indicative of malicious intent. Signed-off-by: Amine <amine.raouane@enim.ac.ma>
1 parent 97f2cd0 commit c323ddd

File tree

12 files changed

+293
-8
lines changed

12 files changed

+293
-8
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ dependencies = [
3838
"problog >= 2.2.6,<3.0.0",
3939
"cryptography >=44.0.0,<45.0.0",
4040
"semgrep == 1.113.0",
41+
"email-validator >=2.2.0,<3.0.0",
4142
]
4243
keywords = []
4344
# https://pypi.org/classifiers/

src/macaron/config/defaults.ini

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,9 @@ cost = 1.0
612612
# The path to the file that contains the list of popular packages.
613613
popular_packages_path =
614614

615+
# A boolean value that determines whether to check the deliverability of the email address.
616+
check_deliverability = True
617+
615618
# ==== The following sections are for source code analysis using Semgrep ====
616619
# rulesets: a reference to a 'ruleset' in this section refers to a Semgrep .yaml file containing one or more rules.
617620
# rules: a reference to a 'rule' in this section refers to an individual rule ID, specified by the '- id:' field in

src/macaron/malware_analyzer/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,12 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b
6464
> ./scripts/find_packages.sh
6565
> ```
6666
> The script will download the top 5000 PyPI packages and update the resource file automatically.
67+
68+
11. **Fake Email**
69+
- **Description**: Checks if the package maintainer or author has a suspicious or invalid email.
70+
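- **Rule**: Return `HeuristicResult.SKIP` if the package has no author or maintainer email; return `HeuristicResult.FAIL` if no email address can be extracted from a provided email field or if any extracted address fails validation; otherwise, return `HeuristicResult.PASS`.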
71+
- **Dependency**: None.
72+
6773
### Source Code Analysis with Semgrep
6874
**PyPI Source Code Analyzer**
6975
- **Description**: Uses Semgrep, with default rules written in `src/macaron/resources/pypi_malware_rules` and custom rules available by supplying a path to `custom_semgrep_rules` in `defaults.ini`, to scan the package `.tar` source code.

src/macaron/malware_analyzer/pypi_heuristics/heuristics.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ class Heuristics(str, Enum):
4343
#: Indicates that the package source code contains suspicious code patterns.
4444
SUSPICIOUS_PATTERNS = "suspicious_patterns"
4545

46+
#: Indicates that the package maintainer's email address is suspicious or invalid.
47+
FAKE_EMAIL = "fake_email"
48+
4649

4750
class HeuristicResult(str, Enum):
4851
"""Result type indicating the outcome of a heuristic."""
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""The heuristic analyzer to check the email address of the package maintainers."""
5+
6+
import logging
7+
import re
8+
9+
from email_validator import EmailNotValidError, ValidatedEmail, validate_email
10+
11+
from macaron.config.defaults import defaults
12+
from macaron.errors import HeuristicAnalyzerValueError
13+
from macaron.json_tools import JsonType, json_extract
14+
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
15+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
16+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
17+
18+
logger: logging.Logger = logging.getLogger(__name__)
19+
20+
21+
class FakeEmailAnalyzer(BaseHeuristicAnalyzer):
    """Analyze the email addresses of the package author and maintainers.

    Addresses are extracted from the ``author_email`` and ``maintainer_email``
    fields of the PyPI package JSON and each one is validated with
    ``email_validator.validate_email``. Whether the validation also checks
    deliverability is controlled by ``check_deliverability`` in the
    ``heuristic.pypi`` section of ``defaults.ini``.
    """

    # Extracts bare ``local@domain`` addresses from a free-form metadata field,
    # which may hold several addresses and display names. The pattern is only a
    # tokenizer; the actual validation is delegated to ``validate_email``.
    # Local-part segments accept ``_``, ``+`` and ``-`` in addition to
    # alphanumerics, and domain labels accept inner hyphens, so that legitimate
    # addresses such as ``first_last@my-domain.org`` are extracted whole instead
    # of being dropped or truncated.
    PATTERN = re.compile(
        r"""(?<![A-Za-z0-9_+-])                     # do not start in the middle of a local-part segment
        [A-Za-z0-9_+-]+                             # first local-part segment
        (?:\.[A-Za-z0-9_+-]+)*                      # optional dot-separated segments
        @
        [A-Za-z0-9]+(?:-+[A-Za-z0-9]+)*             # domain label, hyphens only between alphanumerics
        (?:\.[A-Za-z0-9]+(?:-+[A-Za-z0-9]+)*)*      # optional sub-domain labels
        \.[A-Za-z]{2,}                              # top-level domain (at least 2 letters)
        \b""",
        re.VERBOSE,
    )

    def __init__(self) -> None:
        super().__init__(
            name="fake_email_analyzer",
            heuristic=Heuristics.FAKE_EMAIL,
            depends_on=None,
        )
        self.check_deliverability: bool = self._load_defaults()

    def _load_defaults(self) -> bool:
        """Load the default values from defaults.ini.

        Returns
        -------
        bool
            The value of ``check_deliverability`` in the ``heuristic.pypi`` section,
            or ``True`` if the section or the option is missing.
        """
        section_name = "heuristic.pypi"
        if defaults.has_section(section_name):
            section = defaults[section_name]
            return section.getboolean("check_deliverability", fallback=True)
        return True

    def get_emails(self, email_field: str) -> list[str]:
        """Extract emails from the given email field.

        Parameters
        ----------
        email_field: str
            The email field from which to extract emails.

        Returns
        -------
        list[str]
            A list of emails extracted from the email field, in order of appearance.
            The list is empty if the field contains no email-like token.
        """
        # A match can never be empty or contain whitespace, so no further
        # stripping or filtering is needed.
        return [match.group(0) for match in self.PATTERN.finditer(email_field)]

    def is_valid_email(self, email: str) -> ValidatedEmail | None:
        """Check if the email is valid according to ``email_validator``.

        The format of the address is always checked. The deliverability of its
        domain is checked only when ``self.check_deliverability`` is enabled.

        Parameters
        ----------
        email: str
            The email address to check.

        Returns
        -------
        ValidatedEmail | None
            The validated email object if the email is valid, otherwise None.
        """
        try:
            return validate_email(email, check_deliverability=self.check_deliverability)
        except EmailNotValidError as err:
            # An invalid address is an expected outcome of this heuristic, so it
            # is logged and reported to the caller as ``None`` instead of raising.
            logger.warning("Invalid email address: %s. Error: %s", email, err)
            return None

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Analyze the package.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            The result and related information collected during the analysis.
            The result is ``SKIP`` when neither an author nor a maintainer email is
            available, ``FAIL`` when a provided field contains no extractable email
            or any extracted email is invalid, and ``PASS`` otherwise.

        Raises
        ------
        HeuristicAnalyzerValueError
            If the package JSON has no ``info`` section.
        """
        package_json = pypi_package_json.package_json
        if not package_json.get("info", {}):
            raise HeuristicAnalyzerValueError("No package info available.")

        author_email = json_extract(package_json, ["info", "author_email"], str)
        maintainer_email = json_extract(package_json, ["info", "maintainer_email"], str)

        if not author_email and not maintainer_email:
            return HeuristicResult.SKIP, {"message": "No author or maintainer email available."}

        validated_emails: list[JsonType] = []
        details = ["normalized", "local_part", "domain"]

        for email_field in (author_email, maintainer_email):
            if not email_field:
                continue

            emails = self.get_emails(email_field)
            if not emails:
                # The field is populated but holds nothing that looks like an address.
                return HeuristicResult.FAIL, {"message": "no emails found in the email field"}

            for email in emails:
                email_info = self.is_valid_email(email)
                if not email_info:
                    # Stop at the first invalid address; one is enough to fail.
                    return HeuristicResult.FAIL, {"invalid_email": email}

                validated_emails.append({key: getattr(email_info, key) for key in details})

        return HeuristicResult.PASS, {"validated_emails": validated_emails}

src/macaron/slsa_analyzer/build_tool/gradle.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
33

44
"""This module contains the Gradle class which inherits BaseBuildTool.
@@ -122,7 +122,7 @@ def get_dep_analyzer(self) -> CycloneDxGradle:
122122
raise DependencyAnalyzerError("No default dependency analyzer is found.")
123123
if not DependencyAnalyzer.tool_valid(defaults.get("dependency.resolver", "dep_tool_gradle")):
124124
raise DependencyAnalyzerError(
125-
f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_gradle')} is not valid.",
125+
f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.",
126126
)
127127

128128
tool_name, tool_version = tuple(

src/macaron/slsa_analyzer/build_tool/maven.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
33

44
"""This module contains the Maven class which inherits BaseBuildTool.
@@ -116,7 +116,7 @@ def get_dep_analyzer(self) -> CycloneDxMaven:
116116
raise DependencyAnalyzerError("No default dependency analyzer is found.")
117117
if not DependencyAnalyzer.tool_valid(defaults.get("dependency.resolver", "dep_tool_maven")):
118118
raise DependencyAnalyzerError(
119-
f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_maven')} is not valid.",
119+
f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_maven')} is not valid.",
120120
)
121121

122122
tool_name, tool_version = tuple(

src/macaron/slsa_analyzer/build_tool/pip.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
33

44
"""This module contains the Pip class which inherits BaseBuildTool.
@@ -88,7 +88,7 @@ def get_dep_analyzer(self) -> DependencyAnalyzer:
8888
tool_name = "cyclonedx_py"
8989
if not DependencyAnalyzer.tool_valid(f"{tool_name}:{cyclonedx_version}"):
9090
raise DependencyAnalyzerError(
91-
f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_gradle')} is not valid.",
91+
f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.",
9292
)
9393
return CycloneDxPython(
9494
resources_path=global_config.resources_path,

src/macaron/slsa_analyzer/build_tool/poetry.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
33

44
"""This module contains the Poetry class which inherits BaseBuildTool.
@@ -126,7 +126,7 @@ def get_dep_analyzer(self) -> DependencyAnalyzer:
126126
tool_name = "cyclonedx_py"
127127
if not DependencyAnalyzer.tool_valid(f"{tool_name}:{cyclonedx_version}"):
128128
raise DependencyAnalyzerError(
129-
f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_gradle')} is not valid.",
129+
f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.",
130130
)
131131
return CycloneDxPython(
132132
resources_path=global_config.resources_path,

src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from macaron.malware_analyzer.pypi_heuristics.metadata.anomalous_version import AnomalousVersionAnalyzer
2121
from macaron.malware_analyzer.pypi_heuristics.metadata.closer_release_join_date import CloserReleaseJoinDateAnalyzer
2222
from macaron.malware_analyzer.pypi_heuristics.metadata.empty_project_link import EmptyProjectLinkAnalyzer
23+
from macaron.malware_analyzer.pypi_heuristics.metadata.fake_email import FakeEmailAnalyzer
2324
from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import HighReleaseFrequencyAnalyzer
2425
from macaron.malware_analyzer.pypi_heuristics.metadata.one_release import OneReleaseAnalyzer
2526
from macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo import SourceCodeRepoAnalyzer
@@ -358,6 +359,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
358359
WheelAbsenceAnalyzer,
359360
AnomalousVersionAnalyzer,
360361
TyposquattingPresenceAnalyzer,
362+
FakeEmailAnalyzer,
361363
]
362364

363365
# name used to query the result of all problog rules, so it can be accessed outside the model.
@@ -425,13 +427,18 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
425427
failed({Heuristics.ONE_RELEASE.value}),
426428
failed({Heuristics.ANOMALOUS_VERSION.value}).
427429
430+
% Package released recently with a maintainer email address that is not valid.
431+
{Confidence.MEDIUM.value}::trigger(malware_medium_confidence_3) :-
432+
quickUndetailed,
433+
failed({Heuristics.FAKE_EMAIL.value}).
428434
% ----- Evaluation -----
429435
430436
% Aggregate result
431437
{problog_result_access} :- trigger(malware_high_confidence_1).
432438
{problog_result_access} :- trigger(malware_high_confidence_2).
433439
{problog_result_access} :- trigger(malware_high_confidence_3).
434440
{problog_result_access} :- trigger(malware_high_confidence_4).
441+
{problog_result_access} :- trigger(malware_medium_confidence_3).
435442
{problog_result_access} :- trigger(malware_medium_confidence_2).
436443
{problog_result_access} :- trigger(malware_medium_confidence_1).
437444
query({problog_result_access}).

0 commit comments

Comments
 (0)