-
Notifications
You must be signed in to change notification settings - Fork 0
feat: implement file extension reporting, auditor functionality, and continuous integration. #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| name: Backup Utility CI | ||
|
|
||
| on: | ||
| push: | ||
| branches: [ "master" ] | ||
| pull_request: | ||
| branches: [ "master" ] | ||
|
|
||
| jobs: | ||
| test: | ||
| runs-on: ubuntu-latest | ||
|
|
||
| steps: | ||
| - uses: actions/checkout@v3 | ||
|
|
||
| - name: Set up Python 3.10 | ||
| uses: actions/setup-python@v4 | ||
| with: | ||
| python-version: "3.10" | ||
|
|
||
| - name: Install dependencies | ||
| run: | | ||
| python -m pip install --upgrade pip | ||
| pip install pytest | ||
|
|
||
| - name: Run Pytest Validation Suite | ||
| run: | | ||
| pytest -v tests/ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,6 @@ | ||
| logs/** | ||
| *.db | ||
| *.db-journal | ||
| *.json | ||
| __pycache__ | ||
| *.py[cod] | ||
| *$py.class | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,58 +18,26 @@ | |
| import json | ||
| from datetime import datetime | ||
|
|
||
| # Load configuration from config.env (simple parsing, no external deps) | ||
| CONFIG_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.env") | ||
| # Load configuration safely from JSON | ||
| CONFIG_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "auditor_config.json") | ||
|
|
||
| def load_config(): | ||
| config = {} | ||
| if not os.path.exists(CONFIG_FILE): | ||
| print(f"Error: {CONFIG_FILE} not found. Please ensure it exists.") | ||
| sys.exit(1) | ||
|
|
||
| with open(CONFIG_FILE, 'r') as f: | ||
| for line in f: | ||
| line = line.strip() | ||
| if line and not line.startswith('#'): | ||
| # Handle bash array format for exclusions roughly | ||
| if line.startswith('EXCLUSIONS=('): | ||
| config['EXCLUSIONS'] = [] | ||
| continue | ||
| if line.startswith('AUDITOR_EXT_FILTER=('): | ||
| config['AUDITOR_EXT_FILTER'] = [] | ||
| continue | ||
|
|
||
| # If we are inside an array parsing (hacky but works for the format we defined) | ||
| if 'EXCLUSIONS' in config and isinstance(config['EXCLUSIONS'], list) and line == ')': | ||
| # Array ended, convert to tuple | ||
| continue | ||
| elif 'EXCLUSIONS' in config and isinstance(config['EXCLUSIONS'], list): | ||
| val = line.strip(' "()') | ||
| if val: config['EXCLUSIONS'].append(val) | ||
| continue | ||
|
|
||
| if 'AUDITOR_EXT_FILTER' in config and isinstance(config['AUDITOR_EXT_FILTER'], list) and line == ')': | ||
| continue | ||
| elif 'AUDITOR_EXT_FILTER' in config and isinstance(config['AUDITOR_EXT_FILTER'], list): | ||
| val = line.strip(' "()') | ||
| if val: config['AUDITOR_EXT_FILTER'].append(val.lower()) | ||
| continue | ||
|
|
||
| if '=' in line: | ||
| key, val = line.split('=', 1) | ||
| config[key] = val.strip(' "') | ||
|
|
||
| # Default fallbacks if parsing didn't catch arrays well | ||
| if 'EXCLUSIONS' not in config: config['EXCLUSIONS'] = [] | ||
| if 'AUDITOR_EXT_FILTER' not in config: config['AUDITOR_EXT_FILTER'] = [] | ||
| if 'AUDITOR_DB_NAME' not in config: config['AUDITOR_DB_NAME'] = 'auditor.db' | ||
|
|
||
| return config | ||
| try: | ||
| config = json.load(f) | ||
| return config | ||
| except json.JSONDecodeError as e: | ||
| print(f"Error parsing JSON configuration in {CONFIG_FILE}: {e}") | ||
| sys.exit(1) | ||
|
|
||
| CONFIG = load_config() | ||
|
|
||
| # Database setup | ||
| DB_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), CONFIG.get('AUDITOR_DB_NAME', 'auditor.db')) | ||
| DB_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), CONFIG.get('DB_NAME', 'auditor.db')) | ||
| LOG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "logs") | ||
|
|
||
| # Ensure logs dir exists | ||
|
|
@@ -127,7 +95,7 @@ def should_exclude(file_path): | |
| return True | ||
|
|
||
| # 2. Check extension filter (if defined) | ||
| ext_filter = CONFIG.get('AUDITOR_EXT_FILTER', []) | ||
| ext_filter = CONFIG.get('EXT_FILTER', []) | ||
| if ext_filter: | ||
| _, ext = os.path.splitext(file_path) | ||
| if ext.lower() not in ext_filter: | ||
|
Comment on lines +98 to +101
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎. |
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| { | ||
| "DB_NAME": "auditor.db", | ||
| "EXCLUSIONS": [ | ||
| ".git/", | ||
| "node_modules/", | ||
| "venv/", | ||
| ".cache/", | ||
| "tmp/", | ||
| "System Volume Information/", | ||
| "$RECYCLE.BIN/" | ||
| ], | ||
| "EXT_FILTER": [ | ||
| ".jpg", ".jpeg", ".png", ".heic", | ||
| ".cr2", ".arw", ".dng", ".tif", | ||
| ".mp4", ".mov", ".m4v", ".3gp", | ||
| ".pdf" | ||
| ] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,55 @@ | ||
| #!/usr/bin/env python3 | ||
| import os | ||
| import sys | ||
| from collections import Counter | ||
|
|
||
| def main(): | ||
| if len(sys.argv) < 2: | ||
| print("Usage: python3 find_extensions.py <target_directory> [output_file]") | ||
| sys.exit(1) | ||
|
|
||
| target_dir = sys.argv[1] | ||
| output_file = sys.argv[2] if len(sys.argv) > 2 else "extension_report.log" | ||
|
|
||
| if not os.path.isdir(target_dir): | ||
| print(f"Error: {target_dir} is not a valid directory.") | ||
| sys.exit(1) | ||
|
|
||
| print(f"Scanning {target_dir} for file extensions...") | ||
| ext_counts = Counter() | ||
|
|
||
| scanned = 0 | ||
| # Walk the directory tree | ||
| for root, dirs, files in os.walk(target_dir): | ||
| for file in files: | ||
| scanned += 1 | ||
| if scanned % 10000 == 0: | ||
| print(f"Scanned {scanned} files...") | ||
|
|
||
| # Extract extension and convert to lowercase | ||
| _, ext = os.path.splitext(file) | ||
| if ext: | ||
| ext_counts[ext.lower()] += 1 | ||
| else: | ||
| ext_counts["<no_extension>"] += 1 | ||
|
|
||
| print(f"\nScan complete. Total files processed: {scanned}") | ||
|
|
||
| # Save formatted report | ||
| with open(output_file, 'w') as f: | ||
| f.write(f"Extension Report for: {target_dir}\n") | ||
| f.write(f"Total Files: {scanned}\n") | ||
| f.write("-" * 40 + "\n") | ||
| f.write(f"{'Extension':<20} | {'Count':<10}\n") | ||
| f.write("-" * 40 + "\n") | ||
|
|
||
| for ext, count in ext_counts.most_common(): | ||
| f.write(f"{ext:<20} | {count:<10}\n") | ||
|
|
||
| print(f"Report saved to: {output_file}") | ||
| print("\nTop 15 most common extensions:") | ||
| for ext, count in ext_counts.most_common(15): | ||
| print(f" {ext:<15} : {count}") | ||
|
|
||
| if __name__ == "__main__": | ||
| main() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,44 @@ | ||
| import sys | ||
| import os | ||
| import tempfile | ||
| import pytest | ||
|
|
||
| # Ensure parent directory is in python path to import auditor | ||
| sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) | ||
|
|
||
| def test_hash_calculation(): | ||
| from auditor import calculate_sha256 | ||
|
|
||
| # Create temporary file to hash | ||
| with tempfile.NamedTemporaryFile(delete=False) as f: | ||
| f.write(b"backup-utility-test") | ||
| temp_path = f.name | ||
|
|
||
| try: | ||
| # Pre-calculated SHA256 of "backup-utility-test" | ||
| expected = "48f23852dc21c9a38e8ffd9f743f847b4d7945b0f4b9006f1635cab462b7fa2b" | ||
| assert calculate_sha256(temp_path) == expected | ||
| finally: | ||
| os.remove(temp_path) | ||
|
|
||
| def test_should_exclude(monkeypatch): | ||
| import auditor | ||
|
|
||
| # Mock the configuration payload testing specific scenarios | ||
| mock_config = { | ||
| "EXCLUSIONS": [".git/", "node_modules/"], | ||
| "EXT_FILTER": [".jpg", ".mp4"] | ||
| } | ||
| monkeypatch.setattr(auditor, 'CONFIG', mock_config) | ||
|
|
||
| # Check directory exclusion rules | ||
| assert auditor.should_exclude("/my/path/.git/config") == True | ||
| assert auditor.should_exclude("/my/path/node_modules/index.js") == True | ||
|
|
||
| # Files missing from explicit Extension Filter SHOULD be excluded | ||
| assert auditor.should_exclude("/my/path/src/main.py") == True | ||
| assert auditor.should_exclude("/my/path/document.pdf") == True | ||
|
|
||
| # Acceptable files based on the mock filter | ||
| assert auditor.should_exclude("/my/path/photo.jpg") == False | ||
| assert auditor.should_exclude("/my/path/video.mp4") == False |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| import os | ||
| import json | ||
| import subprocess | ||
|
|
||
| def test_auditor_config_syntax(): | ||
| """Validates that auditor_config.json is valid JSON and contains required keys.""" | ||
| config_path = os.path.join(os.path.dirname(__file__), '..', 'auditor_config.json') | ||
| assert os.path.exists(config_path), "auditor_config.json does not exist" | ||
|
|
||
| with open(config_path, 'r') as f: | ||
| # Note: If this file has broken JSON syntax, json.load will throw a | ||
| # JSONDecodeError and correctly fail the Pytest suite! | ||
| config = json.load(f) | ||
|
|
||
| assert 'DB_NAME' in config | ||
| assert isinstance(config.get('EXCLUSIONS'), list) | ||
| assert isinstance(config.get('EXT_FILTER'), list) | ||
|
|
||
|
|
||
| def test_bash_config_syntax(): | ||
| """Validates that config.env can be safely sourced by bash without syntax errors.""" | ||
| config_path = os.path.join(os.path.dirname(__file__), '..', 'config.env') | ||
| assert os.path.exists(config_path), "config.env does not exist" | ||
|
|
||
| # Run bash -n (syntax check) on the config file | ||
| result = subprocess.run(['bash', '-n', config_path], capture_output=True, text=True) | ||
| assert result.returncode == 0, f"config.env bash syntax error: {result.stderr}" | ||
|
|
||
| # Evaluate if we can source it safely | ||
| result_source = subprocess.run(['bash', '-c', f'set -e; source {config_path}'], capture_output=True, text=True) | ||
| assert result_source.returncode == 0, f"Error sourcing config.env at runtime: {result_source.stderr}" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This workflow is restricted to
`master` for both `push` and `pull_request`, so in environments where development happens on `main` (as in this repository’s refs), the test job will not run at all and regressions can merge without validation. Please align these branch filters with the actual default/integration branch so CI executes on normal PR and push flows. Useful? React with 👍 / 👎.