diff --git a/src/macaron/provenance/provenance_finder.py b/src/macaron/provenance/provenance_finder.py index 715204a16..3ea4d9537 100644 --- a/src/macaron/provenance/provenance_finder.py +++ b/src/macaron/provenance/provenance_finder.py @@ -15,6 +15,7 @@ from macaron.config.defaults import defaults from macaron.repo_finder.commit_finder import AbstractPurlType, determine_abstract_purl_type from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder +from macaron.repo_finder.repo_utils import get_repo_tags from macaron.slsa_analyzer.analyze_context import AnalyzeContext from macaron.slsa_analyzer.checks.provenance_available_check import ProvenanceAvailableException from macaron.slsa_analyzer.ci_service import GitHubActions @@ -378,15 +379,10 @@ def find_provenance_from_ci( if not digest: logger.debug("Cannot retrieve asset provenance without commit digest.") return None - tags = git_obj.repo.tags - for _tag in tags: - try: - tag_commit = str(_tag.commit) - except ValueError as error: - logger.debug("Commit of tag is a blob or tree: %s", error) - continue - if tag_commit and tag_commit == digest: - tag = str(_tag) + tags = get_repo_tags(git_obj) + for key, value in tags.items(): + if value == digest: + tag = key break if not tag: diff --git a/src/macaron/repo_finder/commit_finder.py b/src/macaron/repo_finder/commit_finder.py index 5e443e56b..5398a8d23 100644 --- a/src/macaron/repo_finder/commit_finder.py +++ b/src/macaron/repo_finder/commit_finder.py @@ -7,13 +7,13 @@ from enum import Enum from re import Pattern -from git import TagReference from gitdb.exc import BadName from packageurl import PackageURL from pydriller import Commit, Git from macaron.repo_finder import repo_finder_deps_dev, to_domain_from_known_purl_types from macaron.repo_finder.repo_finder_enums import CommitFinderInfo +from macaron.repo_finder.repo_utils import get_repo_tags from macaron.slsa_analyzer.git_service import GIT_SERVICES logger: logging.Logger = logging.getLogger(__name__) 
@@ -247,27 +247,13 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) -> logger.debug("Searching for commit of artifact version using tags: %s@%s", name, version) # Only consider tags that have a commit. - repo_tags = git_obj.repo.tags + repo_tags = get_repo_tags(git_obj) if not repo_tags: logger.debug("No tags found for %s", name) return None, CommitFinderInfo.NO_TAGS - valid_tags = {} - for tag in repo_tags: - commit = _get_tag_commit(tag) - if not commit: - logger.debug("No commit found for tag: %s", tag) - continue - - tag_name = str(tag) - valid_tags[tag_name] = tag - - if not valid_tags: - logger.debug("No tags with commits found for %s", name) - return None, CommitFinderInfo.NO_TAGS_WITH_COMMITS - # Match tags. - matched_tags, outcome = match_tags(list(valid_tags.keys()), name, version) + matched_tags, outcome = match_tags(list(repo_tags.keys()), name, version) if not matched_tags: logger.debug("No tags matched for %s", name) @@ -279,25 +265,21 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) -> logger.debug("Up to 5 others: %s", matched_tags[1:6]) tag_name = matched_tags[0] - tag = valid_tags[tag_name] - if not tag: - # Tag names are taken from valid_tags and should always exist within it. - logger.debug("Missing tag name from tag dict: %s not in %s", tag_name, valid_tags.keys()) - - try: - hexsha = tag.commit.hexsha - except ValueError: - logger.debug("Error trying to retrieve digest of commit: %s", tag.commit) - return None, CommitFinderInfo.NO_TAG_COMMIT + commit = None + if tag_name not in repo_tags: + # Tag names are taken from repo_tags and should always exist within it. 
+ logger.debug("Missing tag name from tag dict: %s not in %s", tag_name, repo_tags.keys()) + else: + commit = repo_tags[tag_name] logger.debug( "Found tag %s with commit %s for artifact version %s@%s", - tag, - hexsha, + tag_name, + commit, name, version, ) - return hexsha if hexsha else None, CommitFinderInfo.MATCHED + return commit if commit else None, CommitFinderInfo.MATCHED def _split_name(name: str) -> list[str]: @@ -907,15 +889,3 @@ def _create_suffix_tag_comparison_pattern(tag_part: str) -> Pattern | None: # Combine the alphabetic and zero-extended numeric parts. return re.compile(f"{versioned_string_result.group(1)}(0*){versioned_string_result.group(3)}", re.IGNORECASE) - - -def _get_tag_commit(tag: TagReference) -> Commit | None: - """Return the commit of the passed tag. - - This is a standalone function to more clearly handle the potential error raised by accessing the tag's commit - property. - """ - try: - return tag.commit - except ValueError: - return None diff --git a/src/macaron/repo_finder/repo_finder_enums.py b/src/macaron/repo_finder/repo_finder_enums.py index 87c258491..9d74e3d42 100644 --- a/src/macaron/repo_finder/repo_finder_enums.py +++ b/src/macaron/repo_finder/repo_finder_enums.py @@ -108,9 +108,6 @@ class CommitFinderInfo(Enum): #: Reported if the repository has no Git tags. NO_TAGS = "No Git tags" - #: Reported if the repository has no Git tags with associated commits. - NO_TAGS_WITH_COMMITS = "No Git tags with commits" - #: Reported if the tag selected from the repository fails to resolve to a commit despite having one associated with # it. 
def get_repo_tags(git_obj: Git) -> dict[str, str]:
    """Retrieve the tags of the passed repo.

    This will be attempted using the related Pydriller Git function, but will fall back to a Git subprocess for
    repositories that contain non utf-8 tags.

    Parameters
    ----------
    git_obj: Git
        The Git object of the repository.

    Returns
    -------
    dict[str, str]
        A dictionary of tags mapped to commits. The dictionary is empty if the repository has no tags, or if the
        tags could not be retrieved.
    """
    tags = None
    try:
        tags = git_obj.repo.tags
    except UnicodeDecodeError as error:
        logger.debug("Failed to retrieve tags in utf-8 encoding: %s", error)

    # Only fall back to the subprocess when the lookup itself failed. A successfully retrieved but empty list means
    # the repository simply has no tags, so spawning Git would only produce a misleading failure message.
    if tags is not None:
        tag_dict: dict[str, str] = {}
        for tag in tags:
            try:
                tag_commit = str(tag.commit)
            except ValueError as error:
                # Tags that point at a blob or tree have no commit and are skipped.
                logger.debug("Commit of tag is a blob or tree: %s", error)
                continue
            tag_dict[tag.name] = tag_commit
        return tag_dict

    # Retrieve tags using a Git subprocess.
    repository_path = git_obj.repo.working_tree_dir
    # The working tree directory is None for bare repositories, which os.path.isdir cannot accept.
    if not repository_path or not os.path.isdir(repository_path):
        logger.debug("Cannot retrieve tags, repository path is not a directory: %s", repository_path)
        return {}

    try:
        # The -d flag also lists the dereferenced (^{}) entry of annotated tags.
        result = subprocess.run(  # nosec B603
            args=["git", "show-ref", "--tags", "-d"],
            capture_output=True,
            cwd=repository_path,
            check=False,
        )
    except OSError as error:
        # With check=False only a failure to launch the process can be raised here.
        logger.debug("Failed to retrieve repository tags: %s", error)
        return {}

    if result.returncode != 0:
        logger.debug("Failed to retrieve repository tags.")
        return {}

    decoded_data = decode_git_tags(result.stdout)
    if not decoded_data:
        return {}

    return parse_git_tags(decoded_data)
""" tag_data = list_remote_references(["--tags"], repo) if not tag_data: - return None - tags = {} - - for tag_line in tag_data.splitlines(): - tag_line = tag_line.strip() - if not tag_line: - continue - split = tag_line.split("\t") - if len(split) != 2: - continue - possible_tag = split[1] - if possible_tag.endswith("^{}"): - possible_tag = possible_tag[:-3] - elif possible_tag in tags: - # If a tag already exists, it must be the annotated reference of an annotated tag. - # In that case we skip the tag as it does not point to the proper source commit. - # Note that this should only happen if the tags are received out of standard order. - continue - possible_tag = possible_tag.replace("refs/tags/", "") - if not possible_tag: - continue - tags[possible_tag] = split[0] + return {} + tags = parse_git_tags(tag_data) logger.debug("Found %s tags via ls-remote of %s", len(tags), repo) - return tags @@ -1055,3 +1035,61 @@ def find_highest_git_tag(tags: set[str]) -> str: raise GitTagError("No valid version tag found.") return highest_tag + + +def parse_git_tags(tag_data: str) -> dict[str, str]: + """Parse the tags and commits found within the passed data. + + Parameters + ---------- + tag_data: str + The tag data to parse. + + Returns + ------- + dict[str, str] + A dictionary of tags mapped to commits. + """ + tags = {} + for tag_line in tag_data.splitlines(): + tag_line = tag_line.strip() + if not tag_line: + continue + split = re.split("[\t ]", tag_line, maxsplit=1) + if len(split) != 2: + continue + possible_tag = split[1] + if possible_tag.endswith("^{}"): + possible_tag = possible_tag[:-3] + elif possible_tag in tags: + # If a tag already exists, it must be the annotated reference of an annotated tag. + # In that case we skip the tag as it does not point to the proper source commit. + # Note that this should only happen if the tags are received out of standard order. 
+ continue + possible_tag = possible_tag.replace("refs/tags/", "") + if not possible_tag: + continue + tags[possible_tag] = split[0] + + return tags + + +def decode_git_tags(data: bytes) -> str | None: + """Decode the passed Git tag data. + + Parameters + ---------- + data: bytes + The data to decode. + + Returns + ------- + str | None + The decoded data, or None if an error occurred. + """ + try: + return data.decode("utf-8") + except UnicodeDecodeError as error: + logger.debug("Error decoding stdout as utf-8: %s", error) + # Try other character encodings. + return BytesDecoder.decode(data) diff --git a/src/macaron/util.py b/src/macaron/util.py index 96af86991..a5ba8589e 100644 --- a/src/macaron/util.py +++ b/src/macaron/util.py @@ -419,3 +419,49 @@ def copy_file_bulk(file_list: list, src_path: str, target_path: str) -> bool: return False return True + + +class BytesDecoder: + """This class aims to decode some non-UTF8 bytes to a valid string. + + The aim is not to 'correctly' parse the passed data. Only to successfully do so. + It is assumed that an attempt to decode using UTF8 has already failed. + The top 10 most common encodings (after UTF-8) are tried. + """ + + # Taken from https://w3techs.com/technologies/overview/character_encoding. + COMMON_ENCODINGS = [ + "ISO-8859-1", + "cp1252", + "cp1251", + "euc-jp", + "euc-kr", + "shift_jis", + "gb2312", + "cp1250", + "ISO-8859-2", + "big5", + ] + + @staticmethod + def decode(data: bytes) -> str | None: + """Attempt to decode the passed bytes using common encodings. + + Parameters + ---------- + data: bytes + The data to decode. + + Returns + ------- + str | None + The data as a string if successful, or None. 
+ """ + for encoding in BytesDecoder.COMMON_ENCODINGS: + try: + return data.decode(encoding) + except UnicodeDecodeError: + pass + + logger.debug("Failed to decode bytes using most common character encodings.") + return None diff --git a/tests/integration/cases/maven_acra_unusual_tag/acra-core.source.json b/tests/integration/cases/maven_acra_unusual_tag/acra-core.source.json new file mode 100644 index 000000000..e6a6197e2 --- /dev/null +++ b/tests/integration/cases/maven_acra_unusual_tag/acra-core.source.json @@ -0,0 +1,8 @@ +{ + "purl": "pkg:maven/ch.acra/acra-core@5.12.0\u00c3", + "commit": "8d52a1df9fec10b8f2dd95793a3873b49778e011", + "repo": "https://github.com/ACRA/acra", + "repo_validated": false, + "commit_validated": false, + "url": "https://github.com/ACRA/acra/commit/8d52a1df9fec10b8f2dd95793a3873b49778e011" +} diff --git a/tests/integration/cases/maven_acra_unusual_tag/policy.dl b/tests/integration/cases/maven_acra_unusual_tag/policy.dl new file mode 100644 index 000000000..8badb7fef --- /dev/null +++ b/tests/integration/cases/maven_acra_unusual_tag/policy.dl @@ -0,0 +1,10 @@ +/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("test_policy", component_id, "") :- + check_passed(component_id, "mcn_build_service_1"). + +apply_policy_to("test_policy", component_id) :- + is_component(component_id, "pkg:maven/ch.acra/acra-core@5.12.0"). diff --git a/tests/integration/cases/maven_acra_unusual_tag/test.yaml b/tests/integration/cases/maven_acra_unusual_tag/test.yaml new file mode 100644 index 000000000..9f64bb992 --- /dev/null +++ b/tests/integration/cases/maven_acra_unusual_tag/test.yaml @@ -0,0 +1,32 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Testing a repository with a non-utf8 tag. + +tags: +- macaron-python-package + +steps: +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:maven/ch.acra/acra-core@5.12.0 +- name: Run macaron verify-policy to verify passed/failed checks + kind: verify + options: + policy: policy.dl +- name: Run macaron find-source + kind: find-source + options: + command_args: + - -purl + - pkg:maven/ch.acra/acra-core@5.12.0 +- name: Compare the contents of the report against a known correct one + kind: compare + options: + kind: find_source + result: output/reports/maven/ch_acra/acra-core.source.json + expected: acra-core.source.json diff --git a/tests/integration/cases/repo_finder_pypi_find_source/test.yaml b/tests/integration/cases/repo_finder_pypi_find_source/test.yaml index dee8cdf08..3544e79ca 100644 --- a/tests/integration/cases/repo_finder_pypi_find_source/test.yaml +++ b/tests/integration/cases/repo_finder_pypi_find_source/test.yaml @@ -8,7 +8,7 @@ tags: - macaron-python-package steps: -- name: Run macaron analyze +- name: Run macaron find-source kind: find-source options: command_args: diff --git a/tests/provenance/test_provenance_finder.py b/tests/provenance/test_provenance_finder.py index ba19a7e4a..5a1148364 100644 --- a/tests/provenance/test_provenance_finder.py +++ b/tests/provenance/test_provenance_finder.py @@ -90,6 +90,10 @@ class MockTag: def __str__(self) -> str: return self.commit + def name(self) -> str: + """Return the name of the tag.""" + return "tag_name" + repo = SimpleNamespace(tags=[MockTag()]) diff --git a/tests/repo_finder/test_commit_finder.py b/tests/repo_finder/test_commit_finder.py index dba0e8f1d..3fdefcb36 100644 --- a/tests/repo_finder/test_commit_finder.py +++ b/tests/repo_finder/test_commit_finder.py @@ -18,12 +18,13 @@ from macaron.repo_finder import commit_finder from 
def test_get_repo_tags(mocked_repo_empty_commit: Any) -> None:
    """Check that tags are still returned for a repository that contains a non-utf8 tag."""
    non_utf8_tag = "1.0\u00c3"
    git_repo = Git(REPO_DIR)

    # Write a packed references file whose single tag is encoded as ISO-8859-1 rather than utf-8.
    packed_refs_path = os.path.join(REPO_DIR, ".git", "packed-refs")
    packed_refs_line = f"{mocked_repo_empty_commit.hexsha} refs/tags/{non_utf8_tag}\n"
    with open(packed_refs_path, "w", encoding="ISO-8859-1") as packed_refs:
        packed_refs.write(packed_refs_line)

    # Reading the tags directly through the repository object is expected to fail.
    with pytest.raises(UnicodeDecodeError):
        _ = git_repo.repo.tags

    # The utility function must still return the tag.
    found_tags = get_repo_tags(git_repo)
    assert found_tags
    assert non_utf8_tag in found_tags