Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
name: Backup Utility CI

on:
push:
branches: [ "master" ]
pull_request:
branches: [ "master" ]

jobs:
test:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3

- name: Set up Python 3.10
uses: actions/setup-python@v4
with:
python-version: "3.10"

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pytest

- name: Run Pytest Validation Suite
run: |
pytest -v tests/
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
logs/**
*.db
*.db-journal
*.json
__pycache__
*.py[cod]
*$py.class
Expand Down
12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,21 @@ Ensure you have the following installed on your Ubuntu/Linux system:
* `python3`
* `sqlite3`

### 2. Configuration (`config.env`)
Before running the utility, open `config.env` and define:
### 2. Configuration
Before running the utility, you must configure two files:

**`config.env`**:
Open this file to describe your drive layout. You must define:
1. The **UUIDs** of your hard drives.
2. The **mount points** where they will be attached.
3. The **sync destination paths** mapping how you cascade your data (e.g., 1TB -> 2TB).
4. Thresholds for safe-sync warnings.

**`auditor_config.json`**:
Open this JSON file to customize the Integrity Auditor's parameters:
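1. **`EXT_FILTER`**: The list of file extensions the auditor should hash (e.g., `.jpg`, `.mp4`, `.pdf`). Write each entry in lowercase with its leading dot: file extensions are lowercased before being compared against this list, so an entry such as `.JPG` would never match. Files whose extension is not listed are skipped, which noticeably shortens hashing runs on mechanical drives. Leave the array empty to hash every file that is not excluded.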
2. **`EXCLUSIONS`**: Directories completely ignored by the auditor.

### 3. Usage
Run the interactive menu wrapper:
```bash
Expand Down
52 changes: 10 additions & 42 deletions auditor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,58 +18,26 @@
import json
from datetime import datetime

# Load configuration from config.env (simple parsing, no external deps)
CONFIG_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.env")
# Load configuration safely from JSON
CONFIG_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "auditor_config.json")

# load_config: read the auditor settings from CONFIG_FILE and return them as a dict.
# Prints an error and exits with status 1 when CONFIG_FILE does not exist.
# NOTE(review): this span looks like a diff rendering that interleaves the removed
# config.env line parser with the added JSON loader (no +/- markers survive) —
# confirm against the real auditor.py before relying on the statement order below.
def load_config():
config = {}
if not os.path.exists(CONFIG_FILE):
print(f"Error: {CONFIG_FILE} not found. Please ensure it exists.")
sys.exit(1)

with open(CONFIG_FILE, 'r') as f:
# Line-oriented parser: blank lines and lines starting with '#' are skipped.
for line in f:
line = line.strip()
if line and not line.startswith('#'):
# Handle bash array format for exclusions roughly
# An opening "NAME=(" line only creates the empty list; entries are expected on following lines.
if line.startswith('EXCLUSIONS=('):
config['EXCLUSIONS'] = []
continue
if line.startswith('AUDITOR_EXT_FILTER=('):
config['AUDITOR_EXT_FILTER'] = []
continue

# If we are inside an array parsing (hacky but works for the format we defined)
# NOTE(review): nothing below replaces or removes the list once it exists, so after
# "EXCLUSIONS=(" every later line that is not an array opener or ")" is appended to EXCLUSIONS.
if 'EXCLUSIONS' in config and isinstance(config['EXCLUSIONS'], list) and line == ')':
# Array ended, convert to tuple
continue
elif 'EXCLUSIONS' in config and isinstance(config['EXCLUSIONS'], list):
# Strip surrounding spaces, double quotes and parentheses from the entry.
val = line.strip(' "()')
if val: config['EXCLUSIONS'].append(val)
continue

if 'AUDITOR_EXT_FILTER' in config and isinstance(config['AUDITOR_EXT_FILTER'], list) and line == ')':
continue
elif 'AUDITOR_EXT_FILTER' in config and isinstance(config['AUDITOR_EXT_FILTER'], list):
# Extension entries are stored lower-cased.
val = line.strip(' "()')
if val: config['AUDITOR_EXT_FILTER'].append(val.lower())
continue

# Plain KEY=value line: split on the first '=' and strip spaces/double quotes from the value.
if '=' in line:
key, val = line.split('=', 1)
config[key] = val.strip(' "')

# Default fallbacks if parsing didn't catch arrays well
if 'EXCLUSIONS' not in config: config['EXCLUSIONS'] = []
if 'AUDITOR_EXT_FILTER' not in config: config['AUDITOR_EXT_FILTER'] = []
if 'AUDITOR_DB_NAME' not in config: config['AUDITOR_DB_NAME'] = 'auditor.db'

return config
# JSON loader: parse the open file in one call; malformed JSON is reported
# with the decoder's message and the process exits with status 1.
try:
config = json.load(f)
return config
except json.JSONDecodeError as e:
print(f"Error parsing JSON configuration in {CONFIG_FILE}: {e}")
sys.exit(1)

CONFIG = load_config()

# Database setup
DB_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), CONFIG.get('AUDITOR_DB_NAME', 'auditor.db'))
DB_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), CONFIG.get('DB_NAME', 'auditor.db'))
LOG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "logs")

# Ensure logs dir exists
Expand Down Expand Up @@ -127,7 +95,7 @@ def should_exclude(file_path):
return True

# 2. Check extension filter (if defined)
ext_filter = CONFIG.get('AUDITOR_EXT_FILTER', [])
ext_filter = CONFIG.get('EXT_FILTER', [])
if ext_filter:
_, ext = os.path.splitext(file_path)
if ext.lower() not in ext_filter:
Expand Down
18 changes: 18 additions & 0 deletions auditor_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"DB_NAME": "auditor.db",
"EXCLUSIONS": [
".git/",
"node_modules/",
"venv/",
".cache/",
"tmp/",
"System Volume Information/",
"$RECYCLE.BIN/"
],
"EXT_FILTER": [
".jpg", ".jpeg", ".png", ".heic",
".cr2", ".arw", ".dng", ".tif",
".mp4", ".mov", ".m4v", ".3gp",
".pdf"
]
}
8 changes: 3 additions & 5 deletions config.env
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,6 @@ EXCLUSIONS=(
KNOWN_DUPLICATES_JSON="known_duplicates.json"

# --- Auditor Configuration ---
# Name of the SQLite database file (stored in the backup-utility dir by default)
AUDITOR_DB_NAME="auditor.db"
# Types of files the auditor should hash (extensions)
# Useful for limiting to images/videos. Empty array = hash everything not in EXCLUSIONS.
AUDITOR_EXT_FILTER=(".jpg" ".jpeg" ".png" ".mp4" ".mkv" ".pdf" ".mp3" ".flac")
# Note: The auditor's database name, directory exclusions, and critical
# hashing extension filters have been decoupled and moved to:
# `auditor_config.json` for safer and cleaner Python parsing.
55 changes: 55 additions & 0 deletions find_extensions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python3
import os
import sys
from collections import Counter

def main():
    """Count file extensions under a directory and write a report.

    Command line: ``find_extensions.py <target_directory> [output_file]``.

    Walks ``target_directory`` recursively, tallies the lower-cased extension
    of every file (files without one are grouped under ``<no_extension>``),
    writes the full ranking to ``output_file`` (default
    ``extension_report.log``) and prints the 15 most common extensions.

    Exits with status 1 when the directory argument is missing or does not
    name a directory.
    """
    if len(sys.argv) < 2:
        print("Usage: python3 find_extensions.py <target_directory> [output_file]")
        sys.exit(1)

    target_dir = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else "extension_report.log"

    if not os.path.isdir(target_dir):
        print(f"Error: {target_dir} is not a valid directory.")
        sys.exit(1)

    print(f"Scanning {target_dir} for file extensions...")
    ext_counts = Counter()

    scanned = 0
    # Walk the directory tree
    for _root, _dirs, filenames in os.walk(target_dir):
        for filename in filenames:
            scanned += 1
            if scanned % 10000 == 0:
                print(f"Scanned {scanned} files...")

            _, ext = os.path.splitext(filename)
            if ext:
                # File names that are not valid in the filesystem encoding reach
                # us with lone surrogate characters, which cannot be written or
                # printed as UTF-8. Escape them now so a single odd name cannot
                # abort the report after the whole scan has already finished.
                key = ext.lower().encode("utf-8", "backslashreplace").decode("utf-8")
            else:
                key = "<no_extension>"
            ext_counts[key] += 1

    print(f"\nScan complete. Total files processed: {scanned}")

    # Rank once; the report and the console summary share the same ordering.
    ranked = ext_counts.most_common()

    # Save formatted report. The encoding is pinned so the result does not
    # depend on the locale of the machine running the scan.
    with open(output_file, 'w', encoding="utf-8") as report:
        report.write(f"Extension Report for: {target_dir}\n")
        report.write(f"Total Files: {scanned}\n")
        report.write("-" * 40 + "\n")
        report.write(f"{'Extension':<20} | {'Count':<10}\n")
        report.write("-" * 40 + "\n")

        for ext, count in ranked:
            report.write(f"{ext:<20} | {count:<10}\n")

    print(f"Report saved to: {output_file}")
    print("\nTop 15 most common extensions:")
    for ext, count in ranked[:15]:
        print(f"  {ext:<15} : {count}")

if __name__ == "__main__":
main()
44 changes: 44 additions & 0 deletions tests/test_auditor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import sys
import os
import tempfile
import pytest

# Ensure parent directory is in python path to import auditor
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

def test_hash_calculation():
    """calculate_sha256 must return the known digest of a fixed payload."""
    from auditor import calculate_sha256

    payload = b"backup-utility-test"
    # Pre-calculated SHA256 of "backup-utility-test"
    expected = "48f23852dc21c9a38e8ffd9f743f847b4d7945b0f4b9006f1635cab462b7fa2b"

    # Write the payload to a scratch file that outlives the handle, so the
    # function under test can reopen it by path.
    fd, temp_path = tempfile.mkstemp()
    with os.fdopen(fd, "wb") as handle:
        handle.write(payload)

    try:
        digest = calculate_sha256(temp_path)
        assert digest == expected
    finally:
        os.remove(temp_path)

def test_should_exclude(monkeypatch):
    """should_exclude honours both the directory exclusions and the extension filter.

    The module-level ``auditor.CONFIG`` is replaced with a small fixed
    configuration so the outcome does not depend on the config file on disk.
    """
    import auditor

    # Mock the configuration payload testing specific scenarios
    mock_config = {
        "EXCLUSIONS": [".git/", "node_modules/"],
        "EXT_FILTER": [".jpg", ".mp4"]
    }
    monkeypatch.setattr(auditor, 'CONFIG', mock_config)

    # Check directory exclusion rules
    assert auditor.should_exclude("/my/path/.git/config")
    assert auditor.should_exclude("/my/path/node_modules/index.js")

    # Files missing from explicit Extension Filter SHOULD be excluded
    assert auditor.should_exclude("/my/path/src/main.py")
    assert auditor.should_exclude("/my/path/document.pdf")

    # Acceptable files based on the mock filter
    assert not auditor.should_exclude("/my/path/photo.jpg")
    assert not auditor.should_exclude("/my/path/video.mp4")
31 changes: 31 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
import json
import subprocess

def test_auditor_config_syntax():
    """Check that auditor_config.json parses as JSON and carries the expected keys."""
    tests_dir = os.path.dirname(__file__)
    config_path = os.path.join(tests_dir, '..', 'auditor_config.json')
    assert os.path.exists(config_path), "auditor_config.json does not exist"

    # Malformed JSON makes json.load raise JSONDecodeError, which fails the
    # test without any extra handling here.
    with open(config_path, 'r') as handle:
        config = json.load(handle)

    assert 'DB_NAME' in config
    for list_key in ('EXCLUSIONS', 'EXT_FILTER'):
        assert isinstance(config.get(list_key), list)


def test_bash_config_syntax():
    """Validates that config.env can be safely sourced by bash without syntax errors.

    Two checks are run: a parse-only pass (``bash -n``) and an actual
    ``source`` under ``set -e`` to catch errors that only appear at runtime.
    """
    config_path = os.path.join(os.path.dirname(__file__), '..', 'config.env')
    assert os.path.exists(config_path), "config.env does not exist"

    # Run bash -n (syntax check) on the config file
    result = subprocess.run(['bash', '-n', config_path], capture_output=True, text=True)
    assert result.returncode == 0, f"config.env bash syntax error: {result.stderr}"

    # Evaluate if we can source it safely. The path is handed over as the
    # positional parameter $1 instead of being pasted into the command string,
    # so a checkout directory containing spaces or shell metacharacters cannot
    # break (or inject into) the command. The extra 'bash' argument fills $0.
    result_source = subprocess.run(
        ['bash', '-c', 'set -e; source "$1"', 'bash', config_path],
        capture_output=True,
        text=True,
    )
    assert result_source.returncode == 0, f"Error sourcing config.env at runtime: {result_source.stderr}"