From a08fe16dfa7909d25d4db9f675b0b3e1bc5ac31b Mon Sep 17 00:00:00 2001 From: deucebucket Date: Fri, 17 Apr 2026 19:39:10 -0500 Subject: [PATCH] Fix #208: Persist watch_folder_processed + honor Skaldleita server_notice Problem 1: watch_folder_processed was an in-memory set() that got wiped on every restart. Any file that couldn't be processed (unknown author, ambiguous match, move failure, mtime churn) got re-submitted every scan cycle after a restart, forever. One LM instance generated ~48% of all Skaldleita /match traffic for days on the same filename. Problem 2: Skaldleita PR #129 added a server_notice field to /match responses. When the server detects a retry loop it now sends {severity, code, message, action: abort_task, upgrade_url}. LM ignored it. Fixes: - New watch_folder_processed SQLite table (path PK, processed_at, outcome, error_message). outcome in {moved, move_failed, aborted_by_server}. - watch_folder_is_processed() / watch_folder_mark_processed() helpers in library_manager/database.py. process_watch_folder swapped from set ops to these helpers. Restart no longer resets dedup. - bookdb.py logs every server_notice (with upgrade_url). On action=abort_task it stashes the notice in a threading.local() slot so scope is per-thread (watch worker, API endpoint, pipeline layer don't cross-contaminate). - process_watch_folder reads the abort slot after each identify attempt; if set, marks the item as aborted_by_server and skips the pipeline. Bumps APP_VERSION to 0.9.0-beta.148. --- CHANGELOG.md | 28 +++++++++++++++ README.md | 6 +++- app.py | 33 ++++++++++++------ library_manager/database.py | 53 +++++++++++++++++++++++++++++ library_manager/providers/bookdb.py | 32 +++++++++++++++++ 5 files changed, 141 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 066d4ae..daeb120 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,34 @@ All notable changes to Library Manager will be documented in this file. 
+## [0.9.0-beta.148] - 2026-04-17 + +### Fixed + +- **Issue #208: Watch-folder retry loop survives restarts** — The watch-folder + worker used an in-memory `set()` to remember which files it had already + processed. Every LM restart wiped the set, so whenever a file couldn't be + processed (unknown author, ambiguous match, move failure, mtime churn), the + worker would re-submit it on every scan forever. Server-side evidence showed + one LM instance generating ~48% of all Skaldleita `/match` traffic — 2,840 + requests in a single day on the same filename. Fix: + - New `watch_folder_processed` SQLite table (`path`, `processed_at`, + `outcome`, `error_message`) persists dedup across restarts. `outcome` + values: `moved`, `move_failed`, `aborted_by_server`. + - Added `watch_folder_is_processed()` / `watch_folder_mark_processed()` + helpers in `library_manager/database.py`; watch worker switched from + `set()` ops to these helpers. +- **Issue #208: Skaldleita `server_notice` handler** — Skaldleita responses + can now carry a `server_notice` block (severity/code/message/action/ + upgrade_url). `library_manager/providers/bookdb.py` logs every notice + (with upgrade URL) and, on `action=abort_task`, stashes it in a + `threading.local()` slot. The watch-folder worker reads that slot after + each identify attempt and, if an abort was signalled, marks the item as + `aborted_by_server` and skips the rest of the pipeline — no 30-second + retry loop. 
+ +--- + ## [0.9.0-beta.147] - 2026-04-17 ### Fixed diff --git a/README.md b/README.md index 0871d1d..a15d196 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ **Smart Audiobook Library Organizer with Multi-Source Metadata & AI Verification** -[![Version](https://img.shields.io/badge/version-0.9.0--beta.147-blue.svg)](CHANGELOG.md) +[![Version](https://img.shields.io/badge/version-0.9.0--beta.148-blue.svg)](CHANGELOG.md) [![Docker](https://img.shields.io/badge/docker-ghcr.io-blue.svg)](https://ghcr.io/deucebucket/library-manager) [![License](https://img.shields.io/badge/license-AGPL--3.0-blue.svg)](LICENSE) @@ -16,6 +16,10 @@ ## Recent Changes (stable) +> **beta.148** - **Fix: Watch-Folder Retry Loop Across Restarts + Skaldleita server_notice** (Issue #208) +> - **Persistent watch-folder dedup** - `watch_folder_processed` is now a SQLite table instead of an in-memory `set()`. Restarts no longer wipe it, killing the retry loop that had one LM instance hammering Skaldleita's `/match` every 30 seconds on the same file for days. +> - **Honors Skaldleita's abort signal** - When the server detects a retry loop it sends a `server_notice` in the response. LM now logs it (with an upgrade URL) and, on `action=abort_task`, stops retrying that file immediately. + > **beta.147** - **Critical Fix: Hard Link Safety** (Issue #209) > - **Stop silent copy+delete** - When "Use hard links" was enabled and the watch folder / library sat on different filesystems, LM used to copy every file and delete the originals. That broke torrent seeding and doubled disk use. Now LM fails fast with a clear error and leaves source files untouched. > - **Pre-check filesystem compatibility** - Verifies `st_dev` match before any file operations when hard links are enabled. 
diff --git a/app.py b/app.py index 8663f0e..6972a4c 100644 --- a/app.py +++ b/app.py @@ -11,7 +11,7 @@ - Multi-provider AI (Gemini, OpenRouter, Ollama) """ -APP_VERSION = "0.9.0-beta.147" +APP_VERSION = "0.9.0-beta.148" GITHUB_REPO = "deucebucket/library-manager" # Your GitHub repo # Versioning Guide: @@ -52,7 +52,8 @@ from library_manager.database import ( init_db, get_db, set_db_path, cleanup_garbage_entries, cleanup_duplicate_history_entries, insert_history_entry, - should_requeue_book + should_requeue_book, + watch_folder_is_processed, watch_folder_mark_processed ) from library_manager.models.book_profile import ( SOURCE_WEIGHTS, FIELD_WEIGHTS, FieldValue, BookProfile, @@ -6432,8 +6433,8 @@ def get_circuit_breaker(api_name): # WATCH FOLDER FUNCTIONALITY # ============================================================================ -# Track processed watch folder items to avoid reprocessing -watch_folder_processed = set() +# Issue #208: watch-folder dedup now lives in the watch_folder_processed +# SQLite table (see library_manager.database) so restarts don't reset state. watch_folder_last_scan = 0 def get_watch_folder_items(watch_folder: str, min_age_seconds: int = 30) -> list: @@ -6456,8 +6457,8 @@ def get_watch_folder_items(watch_folder: str, min_age_seconds: int = 30) -> list for item in watch_path.iterdir(): item_path = str(item.resolve()) - # Skip if already processed - if item_path in watch_folder_processed: + # Skip if already processed (persisted in SQLite, Issue #208) + if watch_folder_is_processed(item_path): continue # Check if folder contains audio files or is an audio file @@ -6668,7 +6669,7 @@ def process_watch_folder(config: dict) -> int: Process items in the watch folder. Returns number of items processed. 
""" - global watch_folder_processed, watch_folder_last_scan + global watch_folder_last_scan watch_folder = config.get('watch_folder', '').strip() output_folder = config.get('watch_output_folder', '').strip() @@ -6828,6 +6829,18 @@ def norm_conf(c): except Exception as e: logger.debug(f"Watch folder: API lookup failed, using path analysis: {e}") + # Issue #208: Skaldleita may have signalled 'abort_task' during the + # lookup above (retry-loop protection). Stop retrying this item and + # persist it so future scans skip it until the user upgrades / fixes + # the source. The warning + upgrade URL are already in the logs. + from library_manager.providers.bookdb import get_and_clear_server_abort + server_abort = get_and_clear_server_abort() + if server_abort: + abort_msg = server_abort.get('message', 'Skaldleita requested task abort') + logger.warning(f"Watch folder: Aborting '{item.name}' per Skaldleita server notice") + watch_folder_mark_processed(item_path, 'aborted_by_server', abort_msg) + continue + # Issue #57: Verify drastic author changes before accepting if needs_verification and api_author and api_title: try: @@ -6880,7 +6893,7 @@ def norm_conf(c): if success: logger.info(f"Watch folder: Moved to {new_path}") - watch_folder_processed.add(item_path) + watch_folder_mark_processed(item_path, 'moved') processed += 1 # Add to books table @@ -6914,8 +6927,8 @@ def norm_conf(c): else: logger.error(f"Watch folder: Failed to move {item.name}: {error}") # Issue #49: Track failed items in the database so user can see and fix them - # Add to watch_folder_processed to prevent infinite retry loop - watch_folder_processed.add(item_path) + # Issue #208: persist dedup so the retry loop dies across restarts too + watch_folder_mark_processed(item_path, 'move_failed', error) try: # Check if this item is already tracked c.execute('SELECT id FROM books WHERE path = ?', (item_path,)) diff --git a/library_manager/database.py b/library_manager/database.py index 62d4df6..b25e040 100644 
--- a/library_manager/database.py +++ b/library_manager/database.py @@ -175,6 +175,17 @@ def init_db(db_path=None): api_calls INTEGER DEFAULT 0 )''') + # Issue #208: Persistent watch-folder dedup + # Was an in-memory set(), wiped on restart, which caused the watch worker + # to re-submit the same failing file every cycle (ate ~48% of Skaldleita + # traffic from a single LM instance before server-side cache absorbed it). + c.execute('''CREATE TABLE IF NOT EXISTS watch_folder_processed ( + path TEXT PRIMARY KEY, + processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + outcome TEXT, + error_message TEXT + )''') + conn.commit() conn.close() @@ -187,6 +198,48 @@ def init_db(db_path=None): init_plugin_metrics_table(path) +def watch_folder_is_processed(path, db_path=None): + """Return True if the watch-folder path has already been handled. + + Issue #208: replaces the in-memory set. Survives restarts so the worker + doesn't re-submit the same failing file every scan cycle. + """ + p = db_path or _db_path + if not p: + return False + conn = sqlite3.connect(p, timeout=30) + try: + c = conn.execute( + 'SELECT 1 FROM watch_folder_processed WHERE path = ? LIMIT 1', + (path,) + ) + return c.fetchone() is not None + finally: + conn.close() + + +def watch_folder_mark_processed(path, outcome, error_message=None, db_path=None): + """Record that a watch-folder path has been handled. + + outcome: 'moved' | 'move_failed' | 'aborted_by_server' + Issue #208. + """ + p = db_path or _db_path + if not p: + return + conn = sqlite3.connect(p, timeout=30) + try: + conn.execute( + '''INSERT OR REPLACE INTO watch_folder_processed + (path, processed_at, outcome, error_message) + VALUES (?, CURRENT_TIMESTAMP, ?, ?)''', + (path, outcome, error_message) + ) + conn.commit() + finally: + conn.close() + + def cleanup_garbage_entries(db_path=None): """Remove garbage entries from database on startup. 
diff --git a/library_manager/providers/bookdb.py b/library_manager/providers/bookdb.py index 83e7345..d003733 100644 --- a/library_manager/providers/bookdb.py +++ b/library_manager/providers/bookdb.py @@ -15,6 +15,7 @@ import logging import subprocess import tempfile +import threading import requests from pathlib import Path @@ -33,6 +34,22 @@ logger = logging.getLogger(__name__) +# Issue #208: Skaldleita can signal "stop retrying this task" via a server_notice +# in the JSON response. We stash the notice in a thread-local so the caller +# (e.g. the watch-folder worker) can pick it up and mark the item as aborted +# without a 30-second retry loop. Thread-local keeps the signal scoped to the +# thread that issued the matching request. +_abort_state = threading.local() + + +def get_and_clear_server_abort(): + """Return (and clear) the last server_notice with action=abort_task seen + on this thread, or None. Safe to call when none was set.""" + notice = getattr(_abort_state, 'notice', None) + if notice is not None: + _abort_state.notice = None + return notice + # Skaldleita API endpoint (our metadata service, legacy name: BookDB) BOOKDB_API_URL = "https://bookdb.deucebucket.com" # URL unchanged for backwards compatibility # Public API key for Library Manager users (no config needed) @@ -168,6 +185,21 @@ def search_bookdb(title, author=None, api_key=None, retry_count=0, bookdb_url=No data = resp.json() + # Issue #208: honor Skaldleita server_notice. Log every notice; on + # action=abort_task, stash in thread-local so the watch-folder worker + # can stop retrying instead of hammering /match every 30s. 
+ notice = data.get('server_notice') + if notice: + code = notice.get('code', 'unknown') + msg = notice.get('message', '') + upgrade_url = notice.get('upgrade_url') + severity = notice.get('severity', 'info') + logger.warning(f"[SKALDLEITA] server notice ({severity}) [{code}]: {msg}") + if upgrade_url: + logger.warning(f"[SKALDLEITA] upgrade: {upgrade_url}") + if notice.get('action') == 'abort_task': + _abort_state.notice = notice + # Check confidence threshold if data.get('confidence', 0) < 0.5: logger.debug(f"Skaldleita match below confidence threshold: {data.get('confidence')}")