diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..b2ea630 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,28 @@ +name: Backup Utility CI + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python 3.10 + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + + - name: Run Pytest Validation Suite + run: | + pytest -v tests/ diff --git a/.gitignore b/.gitignore index 1dd557d..fca7fa0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ logs/** *.db *.db-journal -*.json __pycache__ *.py[cod] *$py.class diff --git a/README.md b/README.md index 071e660..68e3274 100644 --- a/README.md +++ b/README.md @@ -22,13 +22,21 @@ Ensure you have the following installed on your Ubuntu/Linux system: * `python3` * `sqlite3` -### 2. Configuration (`config.env`) -Before running the utility, open `config.env` and define: +### 2. Configuration +Before running the utility, you must configure two files: + +**`config.env`**: +Open this file to map your fundamental hardware topology. You must define: 1. The **UUIDs** of your hard drives. 2. The **mount points** where they will be attached. 3. The **sync destination paths** mapping how you cascade your data (e.g., 1TB -> 2TB). 4. Thresholds for safe-sync warnings. +**`auditor_config.json`**: +Open this JSON file to customize the Integrity Auditor's parameters: +1. **`EXT_FILTER`**: Tailor this array with specific file extensions you want protected (e.g., `.jpg`, `.mp4`, `.pdf`). Any non-critical extensions omitted from this list will be intelligently ignored to drastically improve hashing speeds on mechanical drives. +2. **`EXCLUSIONS`**: Directories completely ignored by the auditor. + ### 3. 
def load_config(path=None):
    """Load and normalize the auditor's JSON configuration.

    Args:
        path: Optional path to a JSON configuration file. When omitted, the
            module-level ``CONFIG_FILE`` is used, so existing
            ``load_config()`` callers behave as before.

    Returns:
        dict: The parsed configuration. ``DB_NAME``, ``EXCLUSIONS`` and
        ``EXT_FILTER`` are always present, and every ``EXT_FILTER`` entry is
        lower-cased.

    Exits:
        Prints an error and calls ``sys.exit(1)`` when the file is missing,
        is not valid JSON, is not a JSON object, or when ``EXCLUSIONS`` /
        ``EXT_FILTER`` is not a list.
    """
    config_path = CONFIG_FILE if path is None else path

    try:
        # Explicit encoding: JSON is UTF-8 regardless of the system locale.
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        print(f"Error: {config_path} not found. Please ensure it exists.")
        sys.exit(1)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"Error parsing JSON configuration in {config_path}: {e}")
        sys.exit(1)

    # The rest of the module calls CONFIG.get(...), which only works on a dict.
    if not isinstance(config, dict):
        print(
            f"Error parsing JSON configuration in {config_path}: "
            f"top-level value must be an object, got {type(config).__name__}"
        )
        sys.exit(1)

    # Restore the fallbacks the previous config.env parser guaranteed.
    config.setdefault('DB_NAME', 'auditor.db')
    for key in ('EXCLUSIONS', 'EXT_FILTER'):
        if not isinstance(config.setdefault(key, []), list):
            print(
                f"Error parsing JSON configuration in {config_path}: "
                f"'{key}' must be a list"
            )
            sys.exit(1)

    # should_exclude() compares the lower-cased file extension against this
    # list, so an entry such as ".JPG" would never match unless normalized.
    config['EXT_FILTER'] = [str(ext).lower() for ext in config['EXT_FILTER']]

    return config
def main():
    """Scan a directory tree and report how often each file extension occurs.

    Command line:
        find_extensions.py <target_dir> [output_file]

    ``target_dir`` is walked recursively. Extensions are compared
    case-insensitively, and files without an extension are grouped under
    ``(no extension)``. A full table is written to ``output_file`` (default
    ``extension_report.log``) and the 15 most common extensions are printed.

    Exits with status 1 when ``target_dir`` is missing from the command line
    or is not a directory.
    """
    if len(sys.argv) < 2:
        # The directory argument is mandatory, so the usage line must show it.
        print("Usage: python3 find_extensions.py <target_dir> [output_file]")
        sys.exit(1)

    target_dir = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else "extension_report.log"

    if not os.path.isdir(target_dir):
        print(f"Error: {target_dir} is not a valid directory.")
        sys.exit(1)

    print(f"Scanning {target_dir} for file extensions...")
    ext_counts = Counter()

    scanned = 0
    # Walk the directory tree; only file names matter here.
    for _root, _dirs, files in os.walk(target_dir):
        for file in files:
            scanned += 1
            if scanned % 10000 == 0:
                print(f"Scanned {scanned} files...")

            # Extract the extension and fold case so ".JPG" and ".jpg" merge.
            _, ext = os.path.splitext(file)
            # An explicit label keeps extension-less files from appearing as
            # a blank row in the report.
            ext_counts[ext.lower() if ext else "(no extension)"] += 1

    print(f"\nScan complete. Total files processed: {scanned}")

    # Save the formatted report. UTF-8 with backslashreplace keeps the write
    # from failing on non-ASCII or undecodable file names.
    with open(output_file, 'w', encoding='utf-8', errors='backslashreplace') as f:
        f.write(f"Extension Report for: {target_dir}\n")
        f.write(f"Total Files: {scanned}\n")
        f.write("-" * 40 + "\n")
        f.write(f"{'Extension':<20} | {'Count':<10}\n")
        f.write("-" * 40 + "\n")

        f.writelines(
            f"{ext:<20} | {count:<10}\n"
            for ext, count in ext_counts.most_common()
        )

    print(f"Report saved to: {output_file}")
    print("\nTop 15 most common extensions:")
    for ext, count in ext_counts.most_common(15):
        print(f"  {ext:<15} : {count}")
def test_auditor_config_syntax():
    """Validate that auditor_config.json is a JSON object with the required keys."""
    config_path = os.path.join(os.path.dirname(__file__), '..', 'auditor_config.json')
    assert os.path.exists(config_path), "auditor_config.json does not exist"

    with open(config_path, 'r', encoding='utf-8') as f:
        # Broken JSON syntax raises JSONDecodeError here, which fails the test.
        config = json.load(f)

    # Check the container type first so a non-object document gives a clear
    # failure instead of a confusing membership error.
    assert isinstance(config, dict), "auditor_config.json must contain a JSON object"
    assert 'DB_NAME' in config
    assert isinstance(config.get('EXCLUSIONS'), list)
    assert isinstance(config.get('EXT_FILTER'), list)


def test_bash_config_syntax():
    """Validate that config.env parses and can be sourced by bash without errors."""
    config_path = os.path.join(os.path.dirname(__file__), '..', 'config.env')
    assert os.path.exists(config_path), "config.env does not exist"

    # Run bash -n (parse only, nothing executed) on the config file.
    result = subprocess.run(['bash', '-n', config_path], capture_output=True, text=True)
    assert result.returncode == 0, f"config.env bash syntax error: {result.stderr}"

    # Source it for real. The path is passed as the positional parameter $1
    # rather than interpolated into the command string, so checkout paths with
    # spaces or shell metacharacters are handled correctly.
    result_source = subprocess.run(
        ['bash', '-c', 'set -e; source "$1"', 'bash', config_path],
        capture_output=True,
        text=True,
    )
    assert result_source.returncode == 0, (
        f"Error sourcing config.env at runtime: {result_source.stderr}"
    )