Skip to content

test: refactor fake email heuristic to be more optimistic and allow offline tests #1154

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Aug 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -101,28 +101,40 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
"""
package_json = pypi_package_json.package_json
if not package_json.get("info", {}):
# Malformed JSON, the "info" field must be present
raise HeuristicAnalyzerValueError("No package info available.")

author_email = json_extract(package_json, ["info", "author_email"], str)
maintainer_email = json_extract(package_json, ["info", "maintainer_email"], str)

# If there is no email information, this heuristic does not apply (note, this is common, so
# not a case of malformed JSON)
if not author_email and not maintainer_email:
return HeuristicResult.SKIP, {"message": "No author or maintainer email available."}

validated_emails: list[JsonType] = []
details = ["normalized", "local_part", "domain"]
# non_emails are ones where an email format cannot be extracted
# invalid_emails are ones that are invalid as per validate_email()
# valid_emails are ones that are valid as per validate_email()
detail_info: dict = {"non_emails": [], "invalid_emails": [], "valid_emails": []}
result = HeuristicResult.FAIL

for email_field in [author_email, maintainer_email]:
if email_field:
emails = self.get_emails(email_field)
if not emails:
return HeuristicResult.FAIL, {"message": "no emails found in the email field"}
if not email_field:
continue

for email in emails:
email_info = self.is_valid_email(email)
if not email_info:
return HeuristicResult.FAIL, {"invalid_email": email}
emails = self.get_emails(email_field)
if not emails:
detail_info["non_emails"].append(email_field)

validated_emails.append({key: getattr(email_info, key) for key in details})
for email in emails:
email_info = self.is_valid_email(email)
if email_info is None:
detail_info["invalid_emails"].append(email)
continue

return HeuristicResult.PASS, {"validated_emails": validated_emails}
logger.debug("Email %s normalized to %s", email, email_info.normalized)
detail_info["valid_emails"].append(email_info.normalized)
# Optimistic, so if there exists a valid email, we will pass this heuristic
result = HeuristicResult.PASS

return result, detail_info
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
# Verify the email lists that the mcn_detect_malicious_metadata_1 check stored in output/macaron.db:
# smooth-operator must have at least one invalid email and email-validator at least one valid email.
# Exit codes: 0 on success, 1 if the smooth-operator check fails, 2 if the email-validator check fails.

# Print "true" when the named list in a component's detail_information is non-empty and "false"
# when it is empty or absent. Nothing is printed when the row is missing or the stored JSON cannot
# be parsed, so callers must compare against "true" rather than "false".
# Arguments:
#   $1 - component name (e.g. smooth-operator)
#   $2 - key of the list inside detail_information (invalid_emails or valid_emails)
email_list_is_populated() {
    local component="$1"
    local list_key="$2"
    sqlite3 --json output/macaron.db "SELECT detect_malicious_metadata_check.detail_information
        FROM detect_malicious_metadata_check JOIN check_facts on detect_malicious_metadata_check.id = check_facts.id
        JOIN check_result on check_facts.check_result_id = check_result.id JOIN component
        ON component.id = check_result.component_id WHERE check_result.check_id = 'mcn_detect_malicious_metadata_1'
        AND component.name = '${component}'" \
        | jq -r --arg key "$list_key" '.[0].detail_information | fromjson | .[$key] | length > 0'
}

result=$(email_list_is_populated "smooth-operator" "invalid_emails")
# Anything other than "true" (including empty output from a missing row) is a failure.
if [[ "$result" != "true" ]]; then
    echo "ERROR: the invalid_emails report for smooth-operator is empty or missing" >&2
    exit 1
fi

result=$(email_list_is_populated "email-validator" "valid_emails")
if [[ "$result" != "true" ]]; then
    echo "ERROR: the valid_emails report for email-validator is empty or missing" >&2
    exit 2
fi
exit 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */
/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */

/* Policies for the fake-email integration test: both analyzed PyPI packages are required to pass
   the mcn_detect_malicious_metadata_1 check. */
/* Presumably prelude.dl declares Policy, apply_policy_to, check_passed and is_component; confirm there. */
#include "prelude.dl"

/* The "check-smooth-operator" policy holds for a component when mcn_detect_malicious_metadata_1 passed for it. */
Policy("check-smooth-operator", component_id, "Check smooth-operator artifacts") :-
check_passed(component_id, "mcn_detect_malicious_metadata_1").

/* Apply the policy to components whose purl matches the smooth-operator PyPI pattern. */
/* NOTE(review): the pattern has no version part; confirm the stored purl matches it as written. */
apply_policy_to("check-smooth-operator", component_id) :-
is_component(component_id, purl),
match("pkg:pypi/smooth-operator", purl).

/* The "check-email-validator" policy holds for a component when mcn_detect_malicious_metadata_1 passed for it. */
Policy("check-email-validator", component_id, "Check email-validator artifacts") :-
check_passed(component_id, "mcn_detect_malicious_metadata_1").

/* Apply the policy to components whose purl matches the email-validator PyPI pattern. */
apply_policy_to("check-email-validator", component_id) :-
is_component(component_id, purl),
match("pkg:pypi/email-validator", purl).
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

description: |
Analyzing two Python packages to check whether their email addresses are valid and deliverable. The package smooth-operator is known
to use an invalid email address with an example.com domain. email-validator is known to have a valid, deliverable email address.
tags:
- macaron-python-package

steps:
- name: Run macaron analyze against smooth-operator
kind: analyze
options:
command_args:
- -purl
- pkg:pypi/smooth-operator
- name: Run macaron analyze against email-validator
kind: analyze
options:
command_args:
- -purl
- pkg:pypi/email-validator
- name: Query the output database to verify the emails were detected as invalid and valid respectively.
kind: shell
options:
cmd: ./check_emails.sh
- name: Run macaron verify-policy to check the results of the packages still passed.
kind: verify
options:
policy: policy.dl
189 changes: 100 additions & 89 deletions tests/malware_analyzer/pypi/test_fake_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,126 +4,137 @@
"""Tests for the FakeEmailAnalyzer heuristic."""


import os
from pathlib import Path
from unittest.mock import MagicMock

import pytest

from macaron.config.defaults import load_defaults
from macaron.errors import HeuristicAnalyzerValueError
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
from macaron.malware_analyzer.pypi_heuristics.metadata.fake_email import FakeEmailAnalyzer
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset

# If check_deliverability is True, DNS-based check is enabled for email_validator.
# If check_deliverability is True and no network is available, email_validator will perform DNS-based check
# but the DNS queries will time out.
# email_validator doesn't treat timeout as a validation failure:
# https://github.com/JoshData/python-email-validator/blob/98800bac023b8713351393a5043034065f1ea6cb/email_validator/deliverability.py#L144
# Therefore, FakeEmailAnalyzer.is_valid_email doesn't return None, and we will treat this as a "valid email".
# This has a risk of treating an invalid email as valid when an unexpected timeout occurs.
# We have ensured that check_deliverability is always False for all unit test cases in this module, so that behavior
# is not expected to happen.


@pytest.fixture(name="fake_email_defaults_override")
def set_defaults_(tmp_path: Path) -> None:
    """Load a temporary defaults file that turns off check_deliverability.

    With deliverability checks disabled, the tests in this module do not make network connections.

    Parameters
    ----------
    tmp_path: Path
        Pytest temporary path fixture.
    """
    config_path = tmp_path / "config.ini"
    config_path.write_text("\n[heuristic.pypi]\ncheck_deliverability = False\n", encoding="utf-8")
    # The override must load successfully, otherwise the tests would run with the shipped defaults.
    loaded = load_defaults(str(config_path))
    assert loaded is True


@pytest.fixture(name="analyzer")
def analyzer_() -> FakeEmailAnalyzer:
def analyzer_(fake_email_defaults_override: None) -> FakeEmailAnalyzer: # pylint: disable=unused-argument
"""Pytest fixture to create a FakeEmailAnalyzer instance."""
return FakeEmailAnalyzer()


@pytest.fixture(name="pypi_package_json_asset_mock")
def pypi_package_json_asset_mock_() -> MagicMock:
    """Provide a MagicMock standing in for a PyPIPackageJsonAsset.

    The mock is created with PyPIPackageJsonAsset as its spec and starts with an empty
    ``package_json`` dictionary that individual tests overwrite.
    """
    # Keyword arguments to MagicMock are set as attributes on the new mock.
    return MagicMock(spec=PyPIPackageJsonAsset, package_json={})


def test_analyze_skip_no_emails_present(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
"""Test the analyzer skips if no author_email or maintainer_email is present."""
pypi_package_json_asset_mock.package_json = {"info": {"author_email": None, "maintainer_email": None}}
result, info = analyzer.analyze(pypi_package_json_asset_mock)
def test_missing_info(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None:
    """Check that analyze() raises when the PyPI JSON has no 'info' key.

    Parameters
    ----------
    pypi_package_json: MagicMock
        The PyPIPackageJsonAsset MagicMock fixture.
    analyzer: FakeEmailAnalyzer
        An initialized FakeEmailAnalyzer instance.
    """
    # An empty document carries no 'info' key at all.
    empty_package_json: dict = {}
    pypi_package_json.package_json = empty_package_json

    with pytest.raises(HeuristicAnalyzerValueError):
        analyzer.analyze(pypi_package_json)


def test_no_emails_present(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None:
    """Check that the heuristic is skipped when neither email field holds a value.

    Parameters
    ----------
    pypi_package_json: MagicMock
        The PyPIPackageJsonAsset MagicMock fixture.
    analyzer: FakeEmailAnalyzer
        An initialized FakeEmailAnalyzer instance.
    """
    # Both fields are present but explicitly empty.
    info_section = {"author_email": None, "maintainer_email": None}
    pypi_package_json.package_json = {"info": info_section}

    verdict, _details = analyzer.analyze(pypi_package_json)
    assert verdict == HeuristicResult.SKIP
assert info["message"] == "No author or maintainer email available."


def test_analyze_raises_error_for_missing_info_key(
analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock
) -> None:
"""Test the analyzer raises an error if the 'info' key is missing in the PyPI data."""
pypi_package_json_asset_mock.package_json = {} # No 'info' key
with pytest.raises(HeuristicAnalyzerValueError) as exc_info:
analyzer.analyze(pypi_package_json_asset_mock)
assert "No package info available." in str(exc_info.value)


def test_analyze_fail_no_email_found_in_field(
analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock
) -> None:
"""Test the analyzer fails if an email field does not contain a parsable email address."""
pypi_package_json_asset_mock.package_json = {"info": {"author_email": "not an email", "maintainer_email": None}}
result, info = analyzer.analyze(pypi_package_json_asset_mock)
assert result == HeuristicResult.FAIL
assert info == {"message": "no emails found in the email field"}


def test_analyze_fail_invalid_email(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
"""Test analyzer fails if the email field contains an invalid email format."""
invalid_email = "user@example"
pypi_package_json_asset_mock.package_json = {"info": {"author_email": invalid_email, "maintainer_email": None}}
def test_non_email(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None:
"""Test with a non-parsable email address (should fail).
result, info = analyzer.analyze(pypi_package_json_asset_mock)
Parameters
----------
pypi_package_json: MagicMock
The PyPIPackageJsonAsset MagicMock fixture.
analyzer: FakeEmailAnalyzer
An initialized FakeEmailAnalyzer instance.
"""
pypi_package_json.package_json = {"info": {"author_email": "not an email", "maintainer_email": "also not an email"}}
result, info = analyzer.analyze(pypi_package_json)
assert result == HeuristicResult.FAIL
assert info == {"message": "no emails found in the email field"}


def test_analyze_pass_only_maintainer_email_valid(
analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock
) -> None:
"""Test the analyzer passes if only a valid maintainer_email is present and deliverability is not checked."""
email = "maintainer@example.net"
pypi_package_json_asset_mock.package_json = {"info": {"author_email": None, "maintainer_email": email}}
result, info = analyzer.analyze(pypi_package_json_asset_mock)
# assert types (for mypy)
assert isinstance(info["non_emails"], list)

if analyzer.check_deliverability:
assert result == HeuristicResult.FAIL
assert info == {"invalid_email": email}
return

assert result == HeuristicResult.PASS
assert info["validated_emails"] == [
{"normalized": "maintainer@example.net", "local_part": "maintainer", "domain": "example.net"}
]
assert "not an email" in info["non_emails"]
assert "also not an email" in info["non_emails"]


def test_analyze_pass_both_emails_valid(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
"""Test the analyzer passes if both emails are valid and deliverability is not checked."""
author_email = "example@gmail.com"
author_local_part, author_domain = author_email.split("@")
maintainer_email = "maintainer@example.net"
maintainer_local_part, maintainer_domain = maintainer_email.split("@")
def test_valid_email(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None:
"""Test with valid email address format (should pass).
pypi_package_json_asset_mock.package_json = {
"info": {"author_email": author_email, "maintainer_email": maintainer_email}
Parameters
----------
pypi_package_json: MagicMock
The PyPIPackageJsonAsset MagicMock fixture.
analyzer: FakeEmailAnalyzer
An initialized FakeEmailAnalyzer instance.
"""
pypi_package_json.package_json = {
"info": {
"author_email": "test.email.zwiusiubwq@gmail.com",
"maintainer_email": "test.email.fqnmwbsxca@gmail.com",
}
}
result, info = analyzer.analyze(pypi_package_json_asset_mock)
if analyzer.check_deliverability:
assert result == HeuristicResult.FAIL
assert info == {"invalid_email": maintainer_email}
return

result, info = analyzer.analyze(pypi_package_json)
assert result == HeuristicResult.PASS

validated_emails = info.get("validated_emails")
assert isinstance(validated_emails, list)
assert len(validated_emails) == 2
assert {"normalized": author_email, "local_part": author_local_part, "domain": author_domain} in validated_emails
assert {
"normalized": maintainer_email,
"local_part": maintainer_local_part,
"domain": maintainer_domain,
} in validated_emails

# assert types (for mypy)
assert isinstance(info["valid_emails"], list)

def test_is_valid_email_failure(analyzer: FakeEmailAnalyzer) -> None:
"""Test is_valid_email returns None on failure."""
result = analyzer.is_valid_email("invalid-email")
assert result is None
assert "test.email.zwiusiubwq@gmail.com" in info["valid_emails"]
assert "test.email.fqnmwbsxca@gmail.com" in info["valid_emails"]


def test_get_emails(analyzer: FakeEmailAnalyzer) -> None:
"""Test the get_emails method."""
email_field = "test@example.com, another test <another@example.org>"
expected = ["test@example.com", "another@example.org"]
"""Test the get_emails method extracts emails from text correctly.
analyzer: FakeEmailAnalyzer
An initialized FakeEmailAnalyzer instance.
"""
email_field = "test@example.com, Another User <anotheruser@example.org>, please also email me@example.net thanks!"
expected = ["test@example.com", "anotheruser@example.org", "me@example.net"]
assert analyzer.get_emails(email_field) == expected

email_field_no_email = "this is not an email"
Expand Down
Loading