diff --git a/gitshield/claude.py b/gitshield/claude.py index bdef996..7fb8e64 100644 --- a/gitshield/claude.py +++ b/gitshield/claude.py @@ -92,13 +92,13 @@ def uninstall_hook() -> None: group["hooks"] = remaining filtered.append(group) - settings["hooks"]["PreToolUse"] = filtered + settings.setdefault("hooks", {})["PreToolUse"] = filtered # Clean up empty structures - if not settings["hooks"]["PreToolUse"]: - del settings["hooks"]["PreToolUse"] - if not settings["hooks"]: - del settings["hooks"] + if not settings.get("hooks", {}).get("PreToolUse"): + settings.get("hooks", {}).pop("PreToolUse", None) + if not settings.get("hooks"): + settings.pop("hooks", None) _save_settings(settings) click.echo(colorize("GitShield hook removed from Claude Code.", Colors.GREEN)) diff --git a/gitshield/cli.py b/gitshield/cli.py index 94dad75..9e6daa0 100644 --- a/gitshield/cli.py +++ b/gitshield/cli.py @@ -7,7 +7,7 @@ import click from . import __version__ -from .config import filter_findings, load_config, load_ignore_list, find_git_root +from .config import build_custom_patterns, filter_findings, load_config, load_ignore_list, find_git_root from .formatter import print_findings, print_json, print_blocked_message, colorize, Colors from .scanner import scan_path, ScannerError @@ -29,11 +29,11 @@ def main(): def scan(path: str, staged: bool, no_git: bool, as_json: bool, sarif: bool, quiet: bool): """Scan for secrets in PATH (default: current directory).""" try: - findings = scan_path(path, staged_only=staged, no_git=no_git) + config = load_config(Path(path)) + findings = scan_path(path, staged_only=staged, no_git=no_git, scan_tests=config.scan_tests) # Filter ignored ignores = load_ignore_list(Path(path)) - config = load_config(Path(path)) findings = filter_findings(findings, ignores, config=config) # Output @@ -277,12 +277,9 @@ def patrol(repo: str, limit: int, dry_run: bool, stats: bool): click.echo(f" Secrets found: {total_findings}") click.echo(f" Notifications: {notified_count}") - except GitHubError as e: + except (GitHubError, ScannerError) as e: click.echo(colorize(f"Error: {e}", Colors.RED), err=True) sys.exit(1) - except Exception as e: - click.echo(colorize(f"Error: {e}", Colors.RED), err=True) - sys.exit(2) if __name__ == "__main__": diff --git a/gitshield/config.py b/gitshield/config.py index a88e0f8..211d1d8 100644 --- a/gitshield/config.py +++ b/gitshield/config.py @@ -8,12 +8,14 @@ from __future__ import annotations import fnmatch +import functools import os from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, List, Optional, Set -from .scanner import Finding +from .models import Finding +from .patterns import Pattern # --------------------------------------------------------------------------- # TOML parser import -- gracefully degrade when unavailable @@ -205,14 +207,22 @@ def create_default_config(path: Path, force: bool = False) -> Path: # --------------------------------------------------------------------------- # Filtering # --------------------------------------------------------------------------- +@functools.lru_cache(maxsize=512) +def _compile_glob(pattern: str): + """Compile a glob pattern to a regex (cached).""" + import re + return re.compile(fnmatch.translate(pattern)) + + def _matches_any_glob(filepath: str, patterns: List[str]) -> bool: """Check if *filepath* matches any of the given glob patterns.""" for pattern in patterns: + compiled = _compile_glob(pattern) # Match against the full relative path - if fnmatch.fnmatch(filepath, pattern): + if compiled.fullmatch(filepath): return True # Also match against just the filename - if fnmatch.fnmatch(Path(filepath).name, pattern): + if compiled.fullmatch(Path(filepath).name): return True return False @@ -256,6 +266,52 @@ def filter_findings( return filtered +# --------------------------------------------------------------------------- +# Custom pattern builder +# --------------------------------------------------------------------------- + +def build_custom_patterns(config: "GitShieldConfig") -> List[Pattern]: + """Convert config.custom_patterns dicts into Pattern objects. + + Skips entries with invalid severity or un-compilable regex (logs to stderr). + Returns an empty list when config has no custom patterns. + """ + import re + import sys + + built: List[Pattern] = [] + for raw in config.custom_patterns: + pattern_id = str(raw.get("name", "custom-pattern")) + regex_str = str(raw.get("regex", "")) + description = str(raw.get("description", "")) + severity = str(raw.get("severity", "medium")) + entropy_threshold = raw.get("entropy_threshold") + + if not regex_str: + print(f"gitshield: custom pattern '{pattern_id}' has no regex, skipping", file=sys.stderr) + continue + + try: + compiled = re.compile(regex_str) + except re.error as exc: + print(f"gitshield: custom pattern '{pattern_id}' has invalid regex: {exc}", file=sys.stderr) + continue + + try: + built.append(Pattern( + id=pattern_id, + name=pattern_id, + regex=compiled, + description=description, + severity=severity, + entropy_threshold=float(entropy_threshold) if entropy_threshold is not None else None, + )) + except ValueError as exc: + print(f"gitshield: custom pattern '{pattern_id}' error: {exc}", file=sys.stderr) + + return built + + # --------------------------------------------------------------------------- # Legacy ignore file creation (unchanged API) # --------------------------------------------------------------------------- diff --git a/gitshield/engine.py b/gitshield/engine.py index b4a050f..e0a3313 100644 --- a/gitshield/engine.py +++ b/gitshield/engine.py @@ -6,11 +6,12 @@ import fnmatch import os +import re import subprocess from pathlib import Path from typing import List, Optional, Set, Union -from .models import Finding +from .models import Finding, truncate_secret from .patterns import entropy, PATTERNS # Directories to always skip during tree walks. @@ -40,17 +41,18 @@ "-- gitshield:ignore", ) +# ReDoS mitigations: cap gitignore pattern count and length. +_MAX_GITIGNORE_PATTERNS: int = 500 +_MAX_GITIGNORE_PATTERN_LEN: int = 200 + +# Test file patterns -- skipped when scan_tests=False. +_TEST_FILE_PATTERNS = ("test_*.py", "*_test.py") + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- -def _truncate(text: str, max_len: int = 20) -> str: - """Truncate secret for safe display.""" - if len(text) <= max_len: - return text - return text[:max_len] + "..." - def _is_binary_file(filepath: Path) -> bool: """Return True if *filepath* looks like a binary file (null byte in first 8 KB).""" @@ -64,8 +66,8 @@ def _is_binary_file(filepath: Path) -> bool: def _should_skip_path(path: Path) -> bool: """Return True if *path* should be skipped based on directory name or extension.""" - # Check each component for skip-listed directory names. - for part in path.parts: + # Check only directory components for skip-listed directory names (not filename). + for part in path.parent.parts: if part in _SKIP_DIRS: return True # Check binary extensions. @@ -85,25 +87,39 @@ def _parse_gitignore(root: Path) -> List[str]: line = line.strip() if not line or line.startswith("#"): continue + # Skip pathologically long patterns that could cause ReDoS. + if len(line) > _MAX_GITIGNORE_PATTERN_LEN: + continue patterns.append(line) except OSError: pass - return patterns + return patterns[:_MAX_GITIGNORE_PATTERNS] -def _matches_gitignore(rel_path: str, ignore_patterns: List[str]) -> bool: - """Return True if *rel_path* matches any gitignore pattern.""" - for pattern in ignore_patterns: - # Directory-only pattern (trailing slash): match against path components. +def _compile_gitignore_patterns(patterns: List[str]) -> List[tuple]: + """Pre-compile gitignore patterns into (is_dir, compiled_regex) tuples.""" + compiled = [] + for pattern in patterns: if pattern.endswith("/"): dir_pattern = pattern.rstrip("/") - if any(fnmatch.fnmatch(part, dir_pattern) for part in Path(rel_path).parts): + compiled.append((True, re.compile(fnmatch.translate(dir_pattern)))) + else: + compiled.append((False, re.compile(fnmatch.translate(pattern)))) + return compiled + + +def _matches_gitignore(rel_path: str, ignore_patterns: List[tuple]) -> bool: + """Return True if *rel_path* matches any pre-compiled gitignore pattern.""" + for is_dir, compiled_re in ignore_patterns: + if is_dir: + # Directory-only pattern: match against path components. + if any(compiled_re.fullmatch(part) for part in Path(rel_path).parts): return True else: # Match against full relative path and also the basename. - if fnmatch.fnmatch(rel_path, pattern): + if compiled_re.fullmatch(rel_path): return True - if fnmatch.fnmatch(Path(rel_path).name, pattern): + if compiled_re.fullmatch(Path(rel_path).name): return True return False @@ -117,39 +133,53 @@ def scan_text( filename: str = "", line_offset: int = 0, config_threshold: Optional[float] = None, + extra_patterns: Optional[List] = None, ) -> List[Finding]: """Scan a text string line-by-line against all patterns. Args: text: The full text to scan. filename: Logical filename for reporting. - line_offset: Added to every reported line number. + line_offset: A 0-based offset added to the 1-based line index when + computing the reported line number (``line_number = idx + line_offset`` + where ``idx`` starts at 1). The default of 0 means the first line + of *text* is reported as line 1. To report absolute line numbers + when *text* is a slice starting at line N of a larger file, pass + ``line_offset=N-1`` so that the first scanned line is reported as N. Returns: List of Finding objects (one per pattern match per line). """ findings: List[Finding] = [] lines = text.splitlines() + all_patterns = list(PATTERNS) + list(extra_patterns or []) for idx, line in enumerate(lines, start=1): # Honour inline ignore directives. if any(marker in line for marker in _IGNORE_MARKERS): continue - for pattern in PATTERNS: + for pattern in all_patterns: match = pattern.regex.search(line) if match is None: continue matched_text = match.group(0) + # Use the first capturing group for entropy/display when available. + # This avoids prefix inflation (e.g. 'api_key = ' before the value). + secret_text = ( + match.group(1) + if match.lastindex and match.lastindex >= 1 + else matched_text + ) # If the pattern specifies an entropy threshold, enforce it. if pattern.entropy_threshold is not None: - ent = entropy(matched_text) + ent = entropy(secret_text) if ent < pattern.entropy_threshold: continue elif config_threshold is not None: - ent = entropy(matched_text) + ent = entropy(secret_text) if ent < config_threshold: continue else: @@ -161,7 +191,7 @@ def scan_text( file=filename, line=line_number, rule_id=pattern.id, - secret=_truncate(matched_text), + secret=truncate_secret(secret_text), fingerprint=f"{filename}:{pattern.id}:{line_number}", entropy=ent, severity=pattern.severity, @@ -170,7 +200,11 @@ def scan_text( return findings -def scan_file(filepath: Union[str, Path]) -> List[Finding]: +def scan_file( + filepath: Union[str, Path], + config_threshold: Optional[float] = None, + extra_patterns: Optional[List] = None, +) -> List[Finding]: """Scan a single file for secrets. Binary files (null bytes in the first 8 KB) are silently skipped. @@ -178,6 +212,8 @@ def scan_file(filepath: Union[str, Path]) -> List[Finding]: Args: filepath: Path to the file to scan. + config_threshold: Entropy threshold override for patterns without one. + extra_patterns: Additional Pattern objects beyond the built-in list. Returns: List of Finding objects. @@ -187,15 +223,35 @@ def scan_file(filepath: Union[str, Path]) -> List[Finding]: if not filepath.is_file(): return [] - if _is_binary_file(filepath): + # Single-read: check for binary (null bytes in first 8 KB) and decode in one pass. + try: + with open(filepath, "rb") as fh: + raw = fh.read() + except (OSError, IOError): + return [] + + if b"\x00" in raw[:8192]: return [] try: - text = filepath.read_text(encoding="utf-8", errors="replace") - except (OSError, IOError): + text = raw.decode("utf-8", errors="replace") + except (UnicodeDecodeError, ValueError): return [] - return scan_text(text, filename=str(filepath)) + return scan_text( + text, + filename=str(filepath), + config_threshold=config_threshold, + extra_patterns=extra_patterns, + ) + + +def _is_test_file(filename: str) -> bool: + """Return True if *filename* matches a test file pattern.""" + for pattern in _TEST_FILE_PATTERNS: + if fnmatch.fnmatch(filename, pattern): + return True + return False def scan_directory( @@ -203,6 +259,9 @@ def scan_directory( staged_only: bool = False, no_git: bool = False, respect_gitignore: bool = True, + config_threshold: Optional[float] = None, + extra_patterns: Optional[List] = None, + scan_tests: bool = True, ) -> List[Finding]: """Walk a directory tree and scan every eligible file. @@ -211,6 +270,9 @@ def scan_directory( staged_only: If True, only scan files staged in git (``git diff --cached``). no_git: If True, ignore git entirely (no gitignore, no staged filter). respect_gitignore: Honour ``.gitignore`` patterns (unless *no_git*). + config_threshold: Entropy threshold override for patterns without one. + extra_patterns: Additional Pattern objects beyond the built-in list. + scan_tests: If False, skip test files (test_*.py, *_test.py). Returns: Aggregated list of Finding objects. @@ -224,9 +286,10 @@ def scan_directory( return _scan_staged(root) # ---- full tree walk ---- - ignore_patterns: List[str] = [] + ignore_patterns: List[tuple] = [] if respect_gitignore and not no_git: - ignore_patterns = _parse_gitignore(root) + raw_patterns = _parse_gitignore(root) + ignore_patterns = _compile_gitignore_patterns(raw_patterns) findings: List[Finding] = [] @@ -240,6 +303,10 @@ def scan_directory( if _should_skip_path(file_path): continue + # Skip test files when scan_tests is disabled. + if not scan_tests and _is_test_file(filename): + continue + # Gitignore filtering. if ignore_patterns: try: @@ -249,7 +316,13 @@ def scan_directory( if _matches_gitignore(rel, ignore_patterns): continue - findings.extend(scan_file(file_path)) + findings.extend( + scan_file( + file_path, + config_threshold=config_threshold, + extra_patterns=extra_patterns, + ) + ) return findings @@ -258,6 +331,7 @@ def scan_content( content: str, context: str = "content", config_threshold: Optional[float] = None, + extra_patterns: Optional[List] = None, ) -> List[Finding]: """Quick scan of arbitrary content (convenience wrapper for hooks). @@ -267,11 +341,17 @@ def scan_content( content: The text to scan. context: Label used as the ``file`` field in findings. config_threshold: Entropy threshold override for patterns without a threshold. + extra_patterns: Additional Pattern objects beyond the built-in list. Returns: List of Finding objects. """ - return scan_text(content, filename=context, config_threshold=config_threshold) + return scan_text( + content, + filename=context, + config_threshold=config_threshold, + extra_patterns=extra_patterns, + ) # --------------------------------------------------------------------------- @@ -286,7 +366,10 @@ def _scan_staged(root: Path) -> List[Finding]: capture_output=True, text=True, cwd=str(root), + timeout=30, ) + except subprocess.TimeoutExpired: + return [] except (OSError, FileNotFoundError): return [] diff --git a/gitshield/formatter.py b/gitshield/formatter.py index 99209f6..b781526 100644 --- a/gitshield/formatter.py +++ b/gitshield/formatter.py @@ -1,12 +1,13 @@ """Pretty terminal output for scan results.""" +import functools import json import shlex import sys from typing import List from . import __version__ -from .scanner import Finding +from .models import Finding # ANSI colors @@ -21,8 +22,13 @@ class Colors: RESET = "\033[0m" +@functools.lru_cache(maxsize=None) def supports_color() -> bool: - """Check if terminal supports color.""" + """Check if terminal supports color. + + Cached for the process lifetime — stdout's tty status never changes + during a run, and colorize() calls this on every formatted line. + """ return hasattr(sys.stdout, "isatty") and sys.stdout.isatty() diff --git a/gitshield/hook.py b/gitshield/hook.py index 7be188d..d339caa 100644 --- a/gitshield/hook.py +++ b/gitshield/hook.py @@ -5,7 +5,7 @@ from typing import List from .engine import scan_content -from .scanner import Finding +from .models import Finding # Files that should legitimately contain secrets (don't block) @@ -35,10 +35,15 @@ def _is_allowed_path(filepath: str) -> bool: """Check if filepath is in the allowlist (test files, examples, etc.).""" import fnmatch + from pathlib import Path as _Path lower = filepath.lower() + path_parts = _Path(lower).parts + dir_parts = _Path(lower).parent.parts for pattern in ALLOWED_PATHS: if pattern.endswith("/"): - if f"/{pattern}" in lower or lower.startswith(pattern): + # Directory allowlist: check if any path component exactly matches + dir_name = pattern.rstrip("/") + if dir_name in dir_parts: return True elif "*" in pattern: if fnmatch.fnmatch(filepath.split("/")[-1], pattern): @@ -150,9 +155,13 @@ def main() -> None: result = handle_hook(input_data) print(json.dumps(result)) except Exception as e: - # Fail open — never block on hook errors + # Fail open — never block on hook errors. print(json.dumps({"result": "approve"})) - print(f"gitshield hook error: {e}", file=sys.stderr) + print( + f"gitshield: scanning failed ({type(e).__name__}), " + f"tool call approved without scan: {e}", + file=sys.stderr, + ) sys.exit(0) diff --git a/gitshield/models.py b/gitshield/models.py index cee63d2..0160dd4 100644 --- a/gitshield/models.py +++ b/gitshield/models.py @@ -21,6 +21,20 @@ class Finding: severity: str = "medium" +def truncate_secret(secret: str, max_len: int = 20) -> str: + """Truncate a secret value for safe display, keeping both start and end. + + For long secrets this produces ``start...end`` which is more useful for + identification than a start-only truncation (both engines now use the same + format regardless of whether the finding came from the native engine or + gitleaks). + """ + if len(secret) <= max_len: + return secret + keep = (max_len - 3) // 2 + return f"{secret[:keep]}...{secret[-keep:]}" + + class ScannerError(Exception): """Scanner-related errors.""" pass diff --git a/gitshield/monitor.py b/gitshield/monitor.py index 69bd3c2..ee492aa 100644 --- a/gitshield/monitor.py +++ b/gitshield/monitor.py @@ -143,6 +143,10 @@ def clone_and_scan(repo: RepoInfo, skip_recent: bool = True) -> List[Finding]: if skip_recent and was_scanned_recently(repo.url): return [] + # Validate clone URL to prevent injection via spoofed API responses. + if not repo.clone_url.startswith("https://github.com/"): + raise GitHubError(f"Invalid clone URL: {repo.clone_url!r}") + # Create temp directory for clone; resolve to handle /tmp -> /private/tmp on macOS. temp_dir = str(Path(tempfile.mkdtemp(prefix="gitshield_")).resolve()) @@ -173,7 +177,7 @@ def clone_and_scan(repo: RepoInfo, skip_recent: bool = True) -> List[Finding]: except subprocess.TimeoutExpired: return [] - except Exception: + except (OSError, ValueError): return [] finally: # Clean up diff --git a/gitshield/notifier.py b/gitshield/notifier.py index 9fea428..354c97c 100644 --- a/gitshield/notifier.py +++ b/gitshield/notifier.py @@ -6,7 +6,7 @@ import requests from .config import get_github_token -from .scanner import Finding +from .models import Finding from .monitor import RepoInfo from .db import was_notified, mark_notified, get_notified_fingerprints @@ -125,21 +125,17 @@ def create_github_issue( if not token: raise NotifierError("GITHUB_TOKEN not set") - # Build findings table - findings_table = "\n".join([ - f"| `{f.file}` | {f.line} | {f.rule_id} |" - for f in findings - ]) - title = "[Security] Potential secrets exposed in repository" + # Build rule type summary (no file paths or line numbers to avoid leaking metadata) + rule_types = sorted(set(f.rule_id for f in findings)) + rule_summary = ", ".join(rule_types) + body = f"""## GitShield Security Alert -Potential secrets were detected in this repository: +Potential secrets were detected in this repository ({len(findings)} finding(s), types: {rule_summary}). -| File | Line | Type | -|------|------|------| -{findings_table} +Run `gitshield scan` locally for full details including file paths and line numbers. ### Recommended actions diff --git a/gitshield/scanner.py b/gitshield/scanner.py index f9a6b9f..790c7e7 100644 --- a/gitshield/scanner.py +++ b/gitshield/scanner.py @@ -1,6 +1,8 @@ """Secret detection — native engine + optional gitleaks fallback.""" +import functools import json +import os import shutil import subprocess import tempfile @@ -9,11 +11,16 @@ # Re-export from models.py so existing imports of Finding/ScannerError/GitleaksNotFound # from .scanner continue to work without modification. -from .models import Finding, GitleaksNotFound, ScannerError # noqa: F401 +from .models import Finding, GitleaksNotFound, ScannerError, truncate_secret # noqa: F401 +@functools.lru_cache(maxsize=None) def _has_gitleaks() -> Optional[str]: - """Return gitleaks binary path if installed, else None.""" + """Return gitleaks binary path if installed, else None. + + Cached for the process lifetime — the binary won't appear or disappear + during a single run, and this is called on every hook invocation. + """ return shutil.which("gitleaks") @@ -33,6 +40,8 @@ def _scan_with_gitleaks( tmp_dir = tempfile.mkdtemp() report_path = str(Path(tmp_dir) / "report.json") + # Restrict report file permissions on multi-user systems. + os.chmod(tmp_dir, 0o700) try: cmd = [gitleaks] @@ -51,12 +60,14 @@ def _scan_with_gitleaks( "--exit-code", "0", ]) - result = subprocess.run(cmd, capture_output=True, text=True) + result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) if result.stderr and "error" in result.stderr.lower(): raise ScannerError(f"Gitleaks error: {result.stderr.strip()}") report_file = Path(report_path) + if report_file.exists(): + os.chmod(report_path, 0o600) if not report_file.exists() or report_file.stat().st_size == 0: return [] @@ -72,7 +83,7 @@ def _scan_with_gitleaks( file=item.get("File", ""), line=item.get("StartLine", 0), rule_id=item.get("RuleID", "unknown"), - secret=_truncate_secret(item.get("Secret", "")), + secret=truncate_secret(item.get("Secret", "")), fingerprint=item.get("Fingerprint", ""), entropy=item.get("Entropy", 0.0), commit=item.get("Commit"), @@ -81,6 +92,8 @@ def _scan_with_gitleaks( return findings + except subprocess.TimeoutExpired: + return [] finally: shutil.rmtree(tmp_dir, ignore_errors=True) @@ -89,11 +102,21 @@ def scan_path( path: str, staged_only: bool = False, no_git: bool = False, + config_threshold: Optional[float] = None, + extra_patterns: Optional[List] = None, + scan_tests: bool = True, ) -> List[Finding]: """Scan a path for secrets using native engine + optional gitleaks. The native engine always runs. If gitleaks is installed, both engines run and results are merged (deduplicated by fingerprint). + + Args: + path: File or directory path to scan. + staged_only: Scan only git-staged files. + no_git: Ignore git entirely. + config_threshold: Entropy threshold override for patterns without one. + extra_patterns: Additional Pattern objects beyond the built-in list. """ resolved = Path(path).resolve() if not resolved.exists(): @@ -104,12 +127,19 @@ def scan_path( # Always run native engine if resolved.is_file(): - native_findings = scan_file(resolved) + native_findings = scan_file( + resolved, + config_threshold=config_threshold, + extra_patterns=extra_patterns, + ) else: native_findings = scan_directory( resolved, staged_only=staged_only, no_git=no_git, + config_threshold=config_threshold, + extra_patterns=extra_patterns, + scan_tests=scan_tests, ) # Try gitleaks as supplement (not required) diff --git a/tests/fixtures/secret_file.py b/tests/fixtures/secret_file.py index 39683b1..014b8b0 100644 --- a/tests/fixtures/secret_file.py +++ b/tests/fixtures/secret_file.py @@ -1,3 +1,3 @@ -# This file is for testing only -AWS_KEY = "AKIA1234567890ABCDEF" +# This file is for testing only — all keys are deliberately fake and invalid +AWS_KEY = "AKIAFAKETESTKEY00001" GITHUB_TOKEN = "ghp_ABCDEFghijklmn1234567890abcdefghij"