Skip to content

Commit 9940f19

Browse files
authored
chore: handle non-utf8 tags (#1143)
This PR allows tags with non-utf8 characters to be parsed and handled. Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
1 parent bfa4a33 commit 9940f19

File tree

12 files changed

+267
-91
lines changed

12 files changed

+267
-91
lines changed

src/macaron/provenance/provenance_finder.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from macaron.config.defaults import defaults
1717
from macaron.repo_finder.commit_finder import AbstractPurlType, determine_abstract_purl_type
1818
from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder
19+
from macaron.repo_finder.repo_utils import get_repo_tags
1920
from macaron.slsa_analyzer.analyze_context import AnalyzeContext
2021
from macaron.slsa_analyzer.checks.provenance_available_check import ProvenanceAvailableException
2122
from macaron.slsa_analyzer.ci_service import GitHubActions
@@ -387,15 +388,10 @@ def find_provenance_from_ci(
387388
if not digest:
388389
logger.debug("Cannot retrieve asset provenance without commit digest.")
389390
return None
390-
tags = git_obj.repo.tags
391-
for _tag in tags:
392-
try:
393-
tag_commit = str(_tag.commit)
394-
except ValueError as error:
395-
logger.debug("Commit of tag is a blob or tree: %s", error)
396-
continue
397-
if tag_commit and tag_commit == digest:
398-
tag = str(_tag)
391+
tags = get_repo_tags(git_obj)
392+
for key, value in tags.items():
393+
if value == digest:
394+
tag = key
399395
break
400396

401397
if not tag:

src/macaron/repo_finder/commit_finder.py

Lines changed: 12 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@
77
from enum import Enum
88
from re import Pattern
99

10-
from git import TagReference
1110
from gitdb.exc import BadName
1211
from packageurl import PackageURL
1312
from pydriller import Commit, Git
1413

1514
from macaron.repo_finder import repo_finder_deps_dev, to_domain_from_known_purl_types
1615
from macaron.repo_finder.repo_finder_enums import CommitFinderInfo
16+
from macaron.repo_finder.repo_utils import get_repo_tags
1717
from macaron.slsa_analyzer.git_service import GIT_SERVICES
1818

1919
logger: logging.Logger = logging.getLogger(__name__)
@@ -247,27 +247,13 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) ->
247247
logger.debug("Searching for commit of artifact version using tags: %s@%s", name, version)
248248

249249
# Only consider tags that have a commit.
250-
repo_tags = git_obj.repo.tags
250+
repo_tags = get_repo_tags(git_obj)
251251
if not repo_tags:
252252
logger.debug("No tags found for %s", name)
253253
return None, CommitFinderInfo.NO_TAGS
254254

255-
valid_tags = {}
256-
for tag in repo_tags:
257-
commit = _get_tag_commit(tag)
258-
if not commit:
259-
logger.debug("No commit found for tag: %s", tag)
260-
continue
261-
262-
tag_name = str(tag)
263-
valid_tags[tag_name] = tag
264-
265-
if not valid_tags:
266-
logger.debug("No tags with commits found for %s", name)
267-
return None, CommitFinderInfo.NO_TAGS_WITH_COMMITS
268-
269255
# Match tags.
270-
matched_tags, outcome = match_tags(list(valid_tags.keys()), name, version)
256+
matched_tags, outcome = match_tags(list(repo_tags.keys()), name, version)
271257

272258
if not matched_tags:
273259
logger.debug("No tags matched for %s", name)
@@ -279,25 +265,21 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) ->
279265
logger.debug("Up to 5 others: %s", matched_tags[1:6])
280266

281267
tag_name = matched_tags[0]
282-
tag = valid_tags[tag_name]
283-
if not tag:
284-
# Tag names are taken from valid_tags and should always exist within it.
285-
logger.debug("Missing tag name from tag dict: %s not in %s", tag_name, valid_tags.keys())
286-
287-
try:
288-
hexsha = tag.commit.hexsha
289-
except ValueError:
290-
logger.debug("Error trying to retrieve digest of commit: %s", tag.commit)
291-
return None, CommitFinderInfo.NO_TAG_COMMIT
268+
commit = None
269+
if tag_name not in repo_tags:
270+
# Tag names are taken from repo_tags and should always exist within it.
271+
logger.debug("Missing tag name from tag dict: %s not in %s", tag_name, repo_tags.keys())
272+
else:
273+
commit = repo_tags[tag_name]
292274

293275
logger.debug(
294276
"Found tag %s with commit %s for artifact version %s@%s",
295-
tag,
296-
hexsha,
277+
tag_name,
278+
commit,
297279
name,
298280
version,
299281
)
300-
return hexsha if hexsha else None, CommitFinderInfo.MATCHED
282+
return commit if commit else None, CommitFinderInfo.MATCHED
301283

302284

303285
def _split_name(name: str) -> list[str]:
@@ -907,15 +889,3 @@ def _create_suffix_tag_comparison_pattern(tag_part: str) -> Pattern | None:
907889

908890
# Combine the alphabetic and zero-extended numeric parts.
909891
return re.compile(f"{versioned_string_result.group(1)}(0*){versioned_string_result.group(3)}", re.IGNORECASE)
910-
911-
912-
def _get_tag_commit(tag: TagReference) -> Commit | None:
913-
"""Return the commit of the passed tag.
914-
915-
This is a standalone function to more clearly handle the potential error raised by accessing the tag's commit
916-
property.
917-
"""
918-
try:
919-
return tag.commit
920-
except ValueError:
921-
return None

src/macaron/repo_finder/repo_finder_enums.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,6 @@ class CommitFinderInfo(Enum):
108108
#: Reported if the repository has no Git tags.
109109
NO_TAGS = "No Git tags"
110110

111-
#: Reported if the repository has no Git tags with associated commits.
112-
NO_TAGS_WITH_COMMITS = "No Git tags with commits"
113-
114111
#: Reported if the tag selected from the repository fails to resolve to a commit despite having one associated with
115112
# it.
116113
NO_TAG_COMMIT = "No valid commit found for Git tag"

src/macaron/repo_finder/repo_utils.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,16 @@
66
import logging
77
import os
88
import string
9+
import subprocess # nosec B404
910
from urllib.parse import urlparse
1011

1112
from packageurl import PackageURL
13+
from pydriller import Git
1214

1315
from macaron.config.global_config import global_config
1416
from macaron.slsa_analyzer.git_service import GIT_SERVICES, BaseGitService
1517
from macaron.slsa_analyzer.git_service.base_git_service import NoneGitService
16-
from macaron.slsa_analyzer.git_url import GIT_REPOS_DIR
18+
from macaron.slsa_analyzer.git_url import GIT_REPOS_DIR, decode_git_tags, parse_git_tags
1719

1820
logger: logging.Logger = logging.getLogger(__name__)
1921

@@ -169,3 +171,63 @@ def check_repo_urls_are_equivalent(repo_1: str, repo_2: str) -> bool:
169171
return False
170172

171173
return True
174+
175+
176+
def get_repo_tags(git_obj: Git) -> dict[str, str]:
    """Retrieve the tags of the passed repo.

    This will be attempted using the related Pydriller Git function, but will fall back to a Git subprocess for
    repositories that contain non utf-8 tags.

    Parameters
    ----------
    git_obj: Git
        The Git object of the repository.

    Returns
    -------
    dict[str, str]
        A dictionary of tags mapped to commits. The dictionary is empty if no tags could be retrieved.
    """
    tags = None
    try:
        tags = git_obj.repo.tags
    except UnicodeDecodeError as error:
        logger.debug("Failed to retrieve tags in utf-8 encoding: %s", error)

    if tags:
        tag_dict: dict[str, str] = {}
        for tag in tags:
            try:
                tag_commit = str(tag.commit)
            except ValueError as error:
                # Tags that do not resolve to a commit are skipped rather than failing the whole lookup.
                logger.debug("Commit of tag is a blob or tree: %s", error)
                continue
            tag_dict[tag.name] = tag_commit
        return tag_dict

    # Retrieve tags using a Git subprocess.
    repository_path = git_obj.repo.working_tree_dir
    # The working tree directory can be None (e.g. for a bare repository), which os.path.isdir does not accept.
    if not repository_path or not os.path.isdir(repository_path):
        logger.debug("Repository working tree is not a valid directory: %s", repository_path)
        return {}
    try:
        result = subprocess.run(  # nosec B603
            # The -d flag also lists the dereferenced ("^{}") entries of annotated tags.
            args=["git", "show-ref", "--tags", "-d"],
            capture_output=True,
            cwd=repository_path,
            check=False,
        )
    except OSError as error:
        # With check=False a non-zero exit code does not raise; only failures to launch Git end up here.
        logger.debug("Failed to retrieve repository tags: %s", error)
        return {}

    if result.returncode != 0:
        logger.debug("Failed to retrieve repository tags.")
        return {}

    # The output is kept as bytes until here so that non utf-8 tag names can be decoded with a fallback encoding.
    decoded_data = decode_git_tags(result.stdout)
    if not decoded_data:
        return {}

    return parse_git_tags(decoded_data)

src/macaron/slsa_analyzer/git_url.py

Lines changed: 64 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from macaron.config.global_config import global_config
2424
from macaron.environment_variables import get_patched_env
2525
from macaron.errors import CloneError, GitTagError
26+
from macaron.util import BytesDecoder
2627

2728
logger: logging.Logger = logging.getLogger(__name__)
2829

@@ -412,7 +413,7 @@ def list_remote_references(arguments: list[str], repo: str) -> str | None:
412413
logger.error("Failed to retrieve remote references from repo: %s", repo)
413414
return None
414415

415-
return result.stdout.decode("utf-8")
416+
return decode_git_tags(result.stdout)
416417

417418

418419
def resolve_local_path(start_dir: str, local_path: str) -> str:
@@ -944,7 +945,7 @@ def is_commit_hash(value: str) -> bool:
944945
return bool(re.match(pattern, value))
945946

946947

947-
def get_tags_via_git_remote(repo: str) -> dict[str, str] | None:
948+
def get_tags_via_git_remote(repo: str) -> dict[str, str]:
948949
"""Retrieve all tags from a given repository using ls-remote.
949950
950951
Parameters
@@ -955,35 +956,14 @@ def get_tags_via_git_remote(repo: str) -> dict[str, str] | None:
955956
Returns
956957
-------
957958
dict[str]
958-
A dictionary of tags mapped to their commits, or None if the operation failed..
959+
A dictionary of tags mapped to their commits.
959960
"""
960961
tag_data = list_remote_references(["--tags"], repo)
961962
if not tag_data:
962-
return None
963-
tags = {}
964-
965-
for tag_line in tag_data.splitlines():
966-
tag_line = tag_line.strip()
967-
if not tag_line:
968-
continue
969-
split = tag_line.split("\t")
970-
if len(split) != 2:
971-
continue
972-
possible_tag = split[1]
973-
if possible_tag.endswith("^{}"):
974-
possible_tag = possible_tag[:-3]
975-
elif possible_tag in tags:
976-
# If a tag already exists, it must be the annotated reference of an annotated tag.
977-
# In that case we skip the tag as it does not point to the proper source commit.
978-
# Note that this should only happen if the tags are received out of standard order.
979-
continue
980-
possible_tag = possible_tag.replace("refs/tags/", "")
981-
if not possible_tag:
982-
continue
983-
tags[possible_tag] = split[0]
963+
return {}
984964

965+
tags = parse_git_tags(tag_data)
985966
logger.debug("Found %s tags via ls-remote of %s", len(tags), repo)
986-
987967
return tags
988968

989969

@@ -1055,3 +1035,61 @@ def find_highest_git_tag(tags: set[str]) -> str:
10551035
raise GitTagError("No valid version tag found.")
10561036

10571037
return highest_tag
1038+
1039+
1040+
def parse_git_tags(tag_data: str) -> dict[str, str]:
    """Parse the tags and commits found within the passed data.

    Each non-empty line is expected to hold a hash and a reference name separated by a tab or a space, e.g.
    ``<hash>\\trefs/tags/<tag>``. Lines that do not match this shape are ignored. For annotated tags, the
    dereferenced entry (suffixed with ``^{}``) takes precedence regardless of the order the lines arrive in.

    Parameters
    ----------
    tag_data: str
        The tag data to parse.

    Returns
    -------
    dict[str, str]
        A dictionary of tags mapped to commits.
    """
    tags: dict[str, str] = {}
    for tag_line in tag_data.splitlines():
        tag_line = tag_line.strip()
        if not tag_line:
            continue
        split = re.split(r"[\t ]", tag_line, maxsplit=1)
        if len(split) != 2:
            continue
        commit, reference = split

        is_dereferenced = reference.endswith("^{}")
        if is_dereferenced:
            reference = reference[:-3]

        # Only strip the leading namespace so that tag names containing "refs/tags/" are left intact.
        possible_tag = reference.removeprefix("refs/tags/")
        if not possible_tag:
            continue

        if not is_dereferenced and possible_tag in tags:
            # If a tag already exists, it must be the annotated reference of an annotated tag.
            # In that case we skip the tag as it does not point to the proper source commit.
            # Note that this should only happen if the tags are received out of standard order.
            # The comparison uses the stripped name because that is the form stored as the dictionary key.
            continue

        tags[possible_tag] = commit

    return tags
1075+
1076+
1077+
def decode_git_tags(data: bytes) -> str | None:
1078+
"""Decode the passed Git tag data.
1079+
1080+
Parameters
1081+
----------
1082+
data: bytes
1083+
The data to decode.
1084+
1085+
Returns
1086+
-------
1087+
str | None
1088+
The decoded data, or None if an error occurred.
1089+
"""
1090+
try:
1091+
return data.decode("utf-8")
1092+
except UnicodeDecodeError as error:
1093+
logger.debug("Error decoding stdout as utf-8: %s", error)
1094+
# Try other character encodings.
1095+
return BytesDecoder.decode(data)

src/macaron/util.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,3 +520,49 @@ def copy_file_bulk(file_list: list, src_path: str, target_path: str) -> bool:
520520
return False
521521

522522
return True
523+
524+
525+
class BytesDecoder:
526+
"""This class aims to decode some non-UTF8 bytes to a valid string.
527+
528+
The aim is not to 'correctly' parse the passed data. Only to successfully do so.
529+
It is assumed that an attempt to decode using UTF8 has already failed.
530+
The top 10 most common encodings (after UTF-8) are tried.
531+
"""
532+
533+
# Taken from https://w3techs.com/technologies/overview/character_encoding.
534+
COMMON_ENCODINGS = [
535+
"ISO-8859-1",
536+
"cp1252",
537+
"cp1251",
538+
"euc-jp",
539+
"euc-kr",
540+
"shift_jis",
541+
"gb2312",
542+
"cp1250",
543+
"ISO-8859-2",
544+
"big5",
545+
]
546+
547+
@staticmethod
548+
def decode(data: bytes) -> str | None:
549+
"""Attempt to decode the passed bytes using common encodings.
550+
551+
Parameters
552+
----------
553+
data: bytes
554+
The data to decode.
555+
556+
Returns
557+
-------
558+
str | None
559+
The data as a string if successful, or None.
560+
"""
561+
for encoding in BytesDecoder.COMMON_ENCODINGS:
562+
try:
563+
return data.decode(encoding)
564+
except UnicodeDecodeError:
565+
pass
566+
567+
logger.debug("Failed to decode bytes using most common character encodings.")
568+
return None

0 commit comments

Comments
 (0)