diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/fake_email.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/fake_email.py index ff0509f68..300629ae1 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/fake_email.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/fake_email.py @@ -101,28 +101,40 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes """ package_json = pypi_package_json.package_json if not package_json.get("info", {}): + # Malformed JSON, the "info" field must be present raise HeuristicAnalyzerValueError("No package info available.") author_email = json_extract(package_json, ["info", "author_email"], str) maintainer_email = json_extract(package_json, ["info", "maintainer_email"], str) + # If there is no email information, this heuristic does not apply (note, this is common, so + # not a case of malformed JSON) if not author_email and not maintainer_email: return HeuristicResult.SKIP, {"message": "No author or maintainer email available."} - validated_emails: list[JsonType] = [] - details = ["normalized", "local_part", "domain"] + # non_emails are ones where an email format cannot be extracted + # invalid_emails are ones that are invalid as per validate_email() + # valid_emails are ones that are valid as per validate_email() + detail_info: dict = {"non_emails": [], "invalid_emails": [], "valid_emails": []} + result = HeuristicResult.FAIL for email_field in [author_email, maintainer_email]: - if email_field: - emails = self.get_emails(email_field) - if not emails: - return HeuristicResult.FAIL, {"message": "no emails found in the email field"} + if not email_field: + continue - for email in emails: - email_info = self.is_valid_email(email) - if not email_info: - return HeuristicResult.FAIL, {"invalid_email": email} + emails = self.get_emails(email_field) + if not emails: + detail_info["non_emails"].append(email_field) - validated_emails.append({key: getattr(email_info, 
key) for key in details}) + for email in emails: + email_info = self.is_valid_email(email) + if email_info is None: + detail_info["invalid_emails"].append(email) + continue - return HeuristicResult.PASS, {"validated_emails": validated_emails} + logger.debug("Email %s normalized to %s", email, email_info.normalized) + detail_info["valid_emails"].append(email_info.normalized) + # Optimistic, so if there exists a valid email, we will pass this heuristic + result = HeuristicResult.PASS + + return result, detail_info diff --git a/tests/integration/cases/email_checking_pypi_malware_analyzer/check_emails.sh b/tests/integration/cases/email_checking_pypi_malware_analyzer/check_emails.sh new file mode 100755 index 000000000..709ed94f0 --- /dev/null +++ b/tests/integration/cases/email_checking_pypi_malware_analyzer/check_emails.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
+result=$(sqlite3 --json output/macaron.db "SELECT detect_malicious_metadata_check.detail_information + FROM detect_malicious_metadata_check JOIN check_facts on detect_malicious_metadata_check.id = check_facts.id + JOIN check_result on check_facts.check_result_id = check_result.id JOIN component + ON component.id = check_result.component_id WHERE check_result.check_id = 'mcn_detect_malicious_metadata_1' + AND component.name = 'smooth-operator'" | jq -r ".[0].detail_information | fromjson | .invalid_emails | length > 0") +if [[ "$result" == "false" ]]; then + echo "ERROR: the invalid_emails report for smooth-operator is empty" >&2 + exit 1 +fi + +result=$(sqlite3 --json output/macaron.db "SELECT detect_malicious_metadata_check.detail_information + FROM detect_malicious_metadata_check JOIN check_facts on detect_malicious_metadata_check.id = check_facts.id + JOIN check_result on check_facts.check_result_id = check_result.id JOIN component + ON component.id = check_result.component_id WHERE check_result.check_id = 'mcn_detect_malicious_metadata_1' + AND component.name = 'email-validator'" | jq -r ".[0].detail_information | fromjson | .valid_emails | length > 0") +if [[ "$result" == "false" ]]; then + echo "ERROR: the valid_emails report for email-validator is empty" >&2 + exit 2 +fi +exit 0 diff --git a/tests/integration/cases/email_checking_pypi_malware_analyzer/policy.dl b/tests/integration/cases/email_checking_pypi_malware_analyzer/policy.dl new file mode 100644 index 000000000..122be99ee --- /dev/null +++ b/tests/integration/cases/email_checking_pypi_malware_analyzer/policy.dl @@ -0,0 +1,18 @@ +/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("check-smooth-operator", component_id, "Check smooth-operator artifacts") :- + check_passed(component_id, "mcn_detect_malicious_metadata_1"). 
+ +apply_policy_to("check-smooth-operator", component_id) :- + is_component(component_id, purl), + match("pkg:pypi/smooth-operator", purl). + +Policy("check-email-validator", component_id, "Check email-validator artifacts") :- + check_passed(component_id, "mcn_detect_malicious_metadata_1"). + +apply_policy_to("check-email-validator", component_id) :- + is_component(component_id, purl), + match("pkg:pypi/email-validator", purl). diff --git a/tests/integration/cases/email_checking_pypi_malware_analyzer/test.yaml b/tests/integration/cases/email_checking_pypi_malware_analyzer/test.yaml new file mode 100644 index 000000000..f2363311f --- /dev/null +++ b/tests/integration/cases/email_checking_pypi_malware_analyzer/test.yaml @@ -0,0 +1,31 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Analyzing two python packages to check if the email address is deliverable and valid. The package smooth-operator is known + to use an invalid email address with an example.com domain. email-validator is known to have a valid, deliverable, email address. + +tags: +- macaron-python-package + +steps: +- name: Run macaron analyze against smooth-operator + kind: analyze + options: + command_args: + - -purl + - pkg:pypi/smooth-operator +- name: Run macaron analyze against email-validator + kind: analyze + options: + command_args: + - -purl + - pkg:pypi/email-validator +- name: Query the output database to verify the emails were detected as invalid and valid respectively. + kind: shell + options: + cmd: ./check_emails.sh +- name: Run macaron verify-policy to check the results of the packages still passed. 
+ kind: verify + options: + policy: policy.dl diff --git a/tests/malware_analyzer/pypi/test_fake_email.py b/tests/malware_analyzer/pypi/test_fake_email.py index f51fb4110..56e81e035 100644 --- a/tests/malware_analyzer/pypi/test_fake_email.py +++ b/tests/malware_analyzer/pypi/test_fake_email.py @@ -4,126 +4,137 @@ """Tests for the FakeEmailAnalyzer heuristic.""" +import os +from pathlib import Path from unittest.mock import MagicMock import pytest +from macaron.config.defaults import load_defaults from macaron.errors import HeuristicAnalyzerValueError from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult from macaron.malware_analyzer.pypi_heuristics.metadata.fake_email import FakeEmailAnalyzer -from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset + +# If check_deliverability is True, DNS-based check is enabled for email_validator. +# If check_deliverability is True and no network is available, email_validator will perform DNS-based check +# but the DNS queries will time out. +# email_validator doesn't treat timeout as a validation failure: +# https://github.com/JoshData/python-email-validator/blob/98800bac023b8713351393a5043034065f1ea6cb/email_validator/deliverability.py#L144 +# Therefore, FakeEmailAnalyzer.is_valid_email doesn't return None, and we will treat this as a "valid email". +# This has a risk of treating an invalid email as valid when an unexpected timeout occurs. +# We have ensured that check_deliverability is always False for all unit test cases in this module, so that behavior +# is not expected to happen. + + +@pytest.fixture(name="fake_email_defaults_override") +def set_defaults_(tmp_path: Path) -> None: + """Disable check_deliverability in defaults.ini so we do not make network connections. + + Parameters + ---------- + tmp_path: Path + Pytest temporary path fixture. 
+ """ + defaults_file = Path(os.path.join(tmp_path, "config.ini")) + content = """ + [heuristic.pypi] + check_deliverability = False + """ + defaults_file.write_text(content, encoding="utf-8") + assert load_defaults(str(defaults_file)) is True @pytest.fixture(name="analyzer") -def analyzer_() -> FakeEmailAnalyzer: +def analyzer_(fake_email_defaults_override: None) -> FakeEmailAnalyzer: # pylint: disable=unused-argument """Pytest fixture to create a FakeEmailAnalyzer instance.""" return FakeEmailAnalyzer() -@pytest.fixture(name="pypi_package_json_asset_mock") -def pypi_package_json_asset_mock_() -> MagicMock: - """Pytest fixture for a mock PyPIPackageJsonAsset.""" - mock_asset = MagicMock(spec=PyPIPackageJsonAsset) - mock_asset.package_json = {} - return mock_asset - - -def test_analyze_skip_no_emails_present(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None: - """Test the analyzer skips if no author_email or maintainer_email is present.""" - pypi_package_json_asset_mock.package_json = {"info": {"author_email": None, "maintainer_email": None}} - result, info = analyzer.analyze(pypi_package_json_asset_mock) +def test_missing_info(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None: + """Test when JSON 'info' key is missing in the PyPI data (should error). + + Parameters + ---------- + pypi_package_json: MagicMock + The PyPIPackageJsonAsset MagicMock fixture. + analyzer: FakeEmailAnalyzer + An initialized FakeEmailAnalyzer instance. + """ + pypi_package_json.package_json = {} # No 'info' key + with pytest.raises(HeuristicAnalyzerValueError): + analyzer.analyze(pypi_package_json) + + +def test_no_emails_present(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None: + """Test when no author_email or maintainer_email is present (should skip). + + Parameters + ---------- + pypi_package_json: MagicMock + The PyPIPackageJsonAsset MagicMock fixture. 
+ analyzer: FakeEmailAnalyzer + An initialized FakeEmailAnalyzer instance. + """ + pypi_package_json.package_json = {"info": {"author_email": None, "maintainer_email": None}} + result, _ = analyzer.analyze(pypi_package_json) assert result == HeuristicResult.SKIP - assert info["message"] == "No author or maintainer email available." - - -def test_analyze_raises_error_for_missing_info_key( - analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock -) -> None: - """Test the analyzer raises an error if the 'info' key is missing in the PyPI data.""" - pypi_package_json_asset_mock.package_json = {} # No 'info' key - with pytest.raises(HeuristicAnalyzerValueError) as exc_info: - analyzer.analyze(pypi_package_json_asset_mock) - assert "No package info available." in str(exc_info.value) - - -def test_analyze_fail_no_email_found_in_field( - analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock -) -> None: - """Test the analyzer fails if an email field does not contain a parsable email address.""" - pypi_package_json_asset_mock.package_json = {"info": {"author_email": "not an email", "maintainer_email": None}} - result, info = analyzer.analyze(pypi_package_json_asset_mock) - assert result == HeuristicResult.FAIL - assert info == {"message": "no emails found in the email field"} -def test_analyze_fail_invalid_email(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None: - """Test analyzer fails if the email field contains an invalid email format.""" - invalid_email = "user@example" - pypi_package_json_asset_mock.package_json = {"info": {"author_email": invalid_email, "maintainer_email": None}} +def test_non_email(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None: + """Test with a non-parsable email address (should fail). - result, info = analyzer.analyze(pypi_package_json_asset_mock) + Parameters + ---------- + pypi_package_json: MagicMock + The PyPIPackageJsonAsset MagicMock fixture. 
+ analyzer: FakeEmailAnalyzer + An initialized FakeEmailAnalyzer instance. + """ + pypi_package_json.package_json = {"info": {"author_email": "not an email", "maintainer_email": "also not an email"}} + result, info = analyzer.analyze(pypi_package_json) assert result == HeuristicResult.FAIL - assert info == {"message": "no emails found in the email field"} - -def test_analyze_pass_only_maintainer_email_valid( - analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock -) -> None: - """Test the analyzer passes if only a valid maintainer_email is present and deliverability is not checked.""" - email = "maintainer@example.net" - pypi_package_json_asset_mock.package_json = {"info": {"author_email": None, "maintainer_email": email}} - result, info = analyzer.analyze(pypi_package_json_asset_mock) + # assert types (for mypy) + assert isinstance(info["non_emails"], list) - if analyzer.check_deliverability: - assert result == HeuristicResult.FAIL - assert info == {"invalid_email": email} - return - - assert result == HeuristicResult.PASS - assert info["validated_emails"] == [ - {"normalized": "maintainer@example.net", "local_part": "maintainer", "domain": "example.net"} - ] + assert "not an email" in info["non_emails"] + assert "also not an email" in info["non_emails"] -def test_analyze_pass_both_emails_valid(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None: - """Test the analyzer passes if both emails are valid and deliverability is not checked.""" - author_email = "example@gmail.com" - author_local_part, author_domain = author_email.split("@") - maintainer_email = "maintainer@example.net" - maintainer_local_part, maintainer_domain = maintainer_email.split("@") +def test_valid_email(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None: + """Test with valid email address format (should pass). 
- pypi_package_json_asset_mock.package_json = { - "info": {"author_email": author_email, "maintainer_email": maintainer_email} + Parameters + ---------- + pypi_package_json: MagicMock + The PyPIPackageJsonAsset MagicMock fixture. + analyzer: FakeEmailAnalyzer + An initialized FakeEmailAnalyzer instance. + """ + pypi_package_json.package_json = { + "info": { + "author_email": "test.email.zwiusiubwq@gmail.com", + "maintainer_email": "test.email.fqnmwbsxca@gmail.com", + } } - result, info = analyzer.analyze(pypi_package_json_asset_mock) - if analyzer.check_deliverability: - assert result == HeuristicResult.FAIL - assert info == {"invalid_email": maintainer_email} - return - + result, info = analyzer.analyze(pypi_package_json) assert result == HeuristicResult.PASS - validated_emails = info.get("validated_emails") - assert isinstance(validated_emails, list) - assert len(validated_emails) == 2 - assert {"normalized": author_email, "local_part": author_local_part, "domain": author_domain} in validated_emails - assert { - "normalized": maintainer_email, - "local_part": maintainer_local_part, - "domain": maintainer_domain, - } in validated_emails - + # assert types (for mypy) + assert isinstance(info["valid_emails"], list) -def test_is_valid_email_failure(analyzer: FakeEmailAnalyzer) -> None: - """Test is_valid_email returns None on failure.""" - result = analyzer.is_valid_email("invalid-email") - assert result is None + assert "test.email.zwiusiubwq@gmail.com" in info["valid_emails"] + assert "test.email.fqnmwbsxca@gmail.com" in info["valid_emails"] def test_get_emails(analyzer: FakeEmailAnalyzer) -> None: - """Test the get_emails method.""" - email_field = "test@example.com, another test <another@example.org>" - expected = ["test@example.com", "another@example.org"] + """Test the get_emails method extracts emails from text correctly. + + analyzer: FakeEmailAnalyzer + An initialized FakeEmailAnalyzer instance. 
+ """ + email_field = "test@example.com, Another User <anotheruser@example.org>, please also email me@example.net thanks!" + expected = ["test@example.com", "anotheruser@example.org", "me@example.net"] assert analyzer.get_emails(email_field) == expected email_field_no_email = "this is not an email"