From 7fe10e9277a30a4ba77d909a0f7e2377282f43b1 Mon Sep 17 00:00:00 2001 From: thismanyboyfriends2 Date: Tue, 3 Feb 2026 15:42:20 +0000 Subject: [PATCH 1/7] feat: first pass at performer url cleanup --- .../performer-url-cleanup.yml | 17 ++ .../performer_url_cleanup.py | 246 ++++++++++++++++++ 2 files changed, 263 insertions(+) create mode 100644 plugins/performer-url-cleanup/performer-url-cleanup.yml create mode 100644 plugins/performer-url-cleanup/performer_url_cleanup.py diff --git a/plugins/performer-url-cleanup/performer-url-cleanup.yml b/plugins/performer-url-cleanup/performer-url-cleanup.yml new file mode 100644 index 0000000..4cefe2c --- /dev/null +++ b/plugins/performer-url-cleanup/performer-url-cleanup.yml @@ -0,0 +1,17 @@ +name: Performer URL Cleanup +description: Normalises, deduplicates, and sorts performer URLs +version: 1.0.0 +url: https://github.com/thismanyboyfriends2/stash-plugins +exec: + - python + - "{pluginDir}/performer_url_cleanup.py" +interface: raw +tasks: + - name: Preview URL Cleanup + description: Shows what URL changes would be made without applying them + defaultArgs: + mode: preview + - name: Apply URL Cleanup + description: Normalises, deduplicates, and sorts all performer URLs + defaultArgs: + mode: apply diff --git a/plugins/performer-url-cleanup/performer_url_cleanup.py b/plugins/performer-url-cleanup/performer_url_cleanup.py new file mode 100644 index 0000000..35c0a66 --- /dev/null +++ b/plugins/performer-url-cleanup/performer_url_cleanup.py @@ -0,0 +1,246 @@ +"""Performer URL Cleanup Plugin for Stash. + +Normalises, deduplicates, and sorts performer URLs. +""" +import json +import sys +from urllib.parse import urlparse, urlunparse + +try: + import stashapi.log as log + from stashapi.stashapp import StashInterface +except ModuleNotFoundError: + print(json.dumps({ + "output": "Error: stashapp-tools not installed. Run: pip install stashapp-tools" + })) + sys.exit(1) + +# Sites that should not have www prefix +REMOVE_WWW = { + 'x.com', + 'twitter.com', + 'onlyfans.com', + 'instagram.com', + 'fansly.com', + 'pornhub.com', + 'xvideos.com', + 'xhamster.com', +} + +# Domain aliases - map old domains to canonical ones +DOMAIN_ALIASES = { + 'twitter.com': 'x.com', +} + +# Sites that preserve user's chosen capitalisation in the path +PRESERVE_CASE = {'x.com', 'twitter.com'} + + +def normalise_url(url): + """Normalise a URL according to site-specific rules. + + Returns (normalised_url, canonical_domain) tuple. + """ + # Parse the URL + parsed = urlparse(url) + + # Upgrade to HTTPS + scheme = 'https' + + # Normalise domain + domain = parsed.netloc.lower() + + # Remove www if site doesn't use it + if domain.startswith('www.'): + domain_without_www = domain[4:] + if domain_without_www in REMOVE_WWW: + domain = domain_without_www + + # Apply domain aliases + if domain in DOMAIN_ALIASES: + domain = DOMAIN_ALIASES[domain] + + # Handle path + path = parsed.path + + # Remove trailing slash + if path.endswith('/') and len(path) > 1: + path = path.rstrip('/') + + # Case handling - lowercase path unless site preserves case + if domain not in PRESERVE_CASE: + path = path.lower() + + # Reconstruct URL + normalised = urlunparse((scheme, domain, path, '', '', '')) + + return normalised, domain + + +def get_canonical_url(urls): + """Given a list of equivalent URLs, return the canonical one. + + For PRESERVE_CASE sites, keeps the first occurrence. + For others, returns the normalised (lowercase) version. + """ + if not urls: + return None + + # All URLs should normalise to the same thing + # Return the first one's normalised form + return urls[0] + + +def deduplicate_and_sort(urls): + """Normalise, deduplicate, and sort URLs. + + Returns (new_urls, changes) where changes is a list of change descriptions. + """ + if not urls: + return [], [] + + changes = [] + seen = {} # normalised_lower -> (normalised_url, original_url) + + for url in urls: + normalised, domain = normalise_url(url) + normalised_lower = normalised.lower() + + if normalised_lower in seen: + # Duplicate found + existing_normalised, existing_original = seen[normalised_lower] + changes.append(f"Remove duplicate: {url} (same as {existing_original})") + else: + seen[normalised_lower] = (normalised, url) + if normalised != url: + changes.append(f"Normalise: {url} -> {normalised}") + + # Extract normalised URLs and sort by domain + result_urls = [] + for normalised, original in seen.values(): + result_urls.append(normalised) + + # Sort by domain, then full URL + def sort_key(url): + parsed = urlparse(url) + return (parsed.netloc.lower(), url.lower()) + + sorted_urls = sorted(result_urls, key=sort_key) + + # Check if order changed + original_normalised = [normalise_url(u)[0] for u in urls if normalise_url(u)[0].lower() in {u.lower() for u in result_urls}] + if sorted_urls != list(dict.fromkeys(original_normalised)): # Remove dups preserving order + changes.append("Reordered URLs alphabetically by domain") + + return sorted_urls, changes + + +def process_performers(stash, dry_run=True): + """Process all performers and clean up their URLs.""" + # Fetch all performers with URLs + log.info("Fetching performers with URLs...") + + result = stash.find_performers( + f={}, + fragment="id name urls", + get_count=True + ) + + if not result: + log.info("No performers found") + return + + count, performers = result + log.info(f"Found {count} performers to check") + + performers_to_update = [] + + for idx, performer in enumerate(performers): + urls = performer.get('urls') or [] + + if not urls: + continue + + new_urls, changes = deduplicate_and_sort(urls) + + if changes: + performers_to_update.append({ + 'id': performer['id'], + 'name': performer['name'], + 'old_urls': urls, + 'new_urls': new_urls, + 'changes': changes + }) + + # Update progress + if count > 0: + log.progress((idx + 1) / count) + + # Report results + if not performers_to_update: + log.info("No URL changes needed - all performers are already clean") + return + + log.info(f"\n{'=' * 60}") + log.info(f"Found {len(performers_to_update)} performers with URL changes:") + log.info(f"{'=' * 60}\n") + + for p in performers_to_update: + log.info(f"Performer: {p['name']} (ID: {p['id']})") + for change in p['changes']: + log.info(f" - {change}") + log.info(f" Final URLs:") + for url in p['new_urls']: + log.info(f" - {url}") + log.info("") + + if dry_run: + log.info(f"{'=' * 60}") + log.info(f"PREVIEW MODE - No changes applied") + log.info(f"Run 'Apply URL Cleanup' to apply these changes") + log.info(f"{'=' * 60}") + else: + log.info(f"Applying changes to {len(performers_to_update)} performers...") + + for idx, p in enumerate(performers_to_update): + try: + stash.update_performer({ + 'id': p['id'], + 'urls': p['new_urls'] + }) + log.debug(f"Updated {p['name']}") + except Exception as e: + log.error(f"Failed to update {p['name']}: {e}") + + log.progress((idx + 1) / len(performers_to_update)) + + log.info(f"{'=' * 60}") + log.info(f"Applied URL cleanup to {len(performers_to_update)} performers") + log.info(f"{'=' * 60}") + + +def main(): + """Main entry point.""" + # Read JSON input from Stash + json_input = json.loads(sys.stdin.read()) + + # Extract connection info and initialise client + server_connection = json_input["server_connection"] + stash = StashInterface(server_connection) + + # Get mode from args + mode = json_input.get("args", {}).get("mode", "preview") + + log.info(f"Performer URL Cleanup - Mode: {mode}") + log.info("") + + if mode == "preview": + process_performers(stash, dry_run=True) + elif mode == "apply": + process_performers(stash, dry_run=False) + else: + log.error(f"Unknown mode: {mode}") + + +if __name__ == "__main__": + main() From 24548e79a3233eff968d1827c19ac9c5cf3052a6 Mon Sep 17 00:00:00 2001 From: thismanyboyfriends2 Date: Tue, 3 Feb 2026 16:33:24 +0000 Subject: [PATCH 2/7] feat: copy stashbox urls plugin --- plugins/copy-stashbox-urls | 1 + 1 file changed, 1 insertion(+) create mode 160000 plugins/copy-stashbox-urls diff --git a/plugins/copy-stashbox-urls b/plugins/copy-stashbox-urls new file mode 160000 index 0000000..059eecc --- /dev/null +++ b/plugins/copy-stashbox-urls @@ -0,0 +1 @@ +Subproject commit 059eeccc89c84e80f5ddf90e435b567945be0fba From b61309580686e5c060b101c8eb88bf7654aca256 Mon Sep 17 00:00:00 2001 From: thismanyboyfriends2 Date: Tue, 3 Feb 2026 16:37:23 +0000 Subject: [PATCH 3/7] fix: fixing performer url cleanup plugin --- .../performer-url-cleanup.yml | 2 +- .../performer_url_cleanup.py | 19 ++----------------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/plugins/performer-url-cleanup/performer-url-cleanup.yml b/plugins/performer-url-cleanup/performer-url-cleanup.yml index 4cefe2c..40613db 100644 --- a/plugins/performer-url-cleanup/performer-url-cleanup.yml +++ b/plugins/performer-url-cleanup/performer-url-cleanup.yml @@ -3,7 +3,7 @@ description: Normalises, deduplicates, and sorts performer URLs version: 1.0.0 url: https://github.com/thismanyboyfriends2/stash-plugins exec: - - python + - python3 - "{pluginDir}/performer_url_cleanup.py" interface: raw tasks: diff --git a/plugins/performer-url-cleanup/performer_url_cleanup.py b/plugins/performer-url-cleanup/performer_url_cleanup.py index 35c0a66..549e5ed 100644 --- a/plugins/performer-url-cleanup/performer_url_cleanup.py +++ b/plugins/performer-url-cleanup/performer_url_cleanup.py @@ -77,20 +77,6 @@ def normalise_url(url): return normalised, domain -def get_canonical_url(urls): - """Given a list of equivalent URLs, return the canonical one. - - For PRESERVE_CASE sites, keeps the first occurrence. - For others, returns the normalised (lowercase) version. - """ - if not urls: - return None - - # All URLs should normalise to the same thing - # Return the first one's normalised form - return urls[0] - - def deduplicate_and_sort(urls): """Normalise, deduplicate, and sort URLs. @@ -127,9 +113,8 @@ def sort_key(url): sorted_urls = sorted(result_urls, key=sort_key) - # Check if order changed - original_normalised = [normalise_url(u)[0] for u in urls if normalise_url(u)[0].lower() in {u.lower() for u in result_urls}] - if sorted_urls != list(dict.fromkeys(original_normalised)): # Remove dups preserving order + # Check if order changed (result_urls preserves original order after dedup) + if result_urls != sorted_urls: changes.append("Reordered URLs alphabetically by domain") return sorted_urls, changes From a0a726dc9abda87e84c795cb3f58fec6f2b8127b Mon Sep 17 00:00:00 2001 From: thismanyboyfriends2 Date: Tue, 3 Feb 2026 18:16:50 +0000 Subject: [PATCH 4/7] feat: add site-specific URL rules and debug output - Add known domains system to limit changes to configured sites - Add debug files for reviewing changes and potential matches - Add www handling, HTTP-only sites, path transforms, suffix removal - Preserve case by default, only lowercase known case-insensitive sites --- .../performer_url_cleanup.py | 297 +++++++++++++++--- 1 file changed, 252 insertions(+), 45 deletions(-) diff --git a/plugins/performer-url-cleanup/performer_url_cleanup.py b/plugins/performer-url-cleanup/performer_url_cleanup.py index 549e5ed..1ba82df 100644 --- a/plugins/performer-url-cleanup/performer_url_cleanup.py +++ b/plugins/performer-url-cleanup/performer_url_cleanup.py @@ -4,8 +4,15 @@ """ import json import sys +from collections import defaultdict from urllib.parse import urlparse, urlunparse +# Debug output paths (temporary) +DEBUG_DIR = r"C:\stash" +DEBUG_BY_PERFORMER = f"{DEBUG_DIR}\\url_cleanup_by_performer.txt" +DEBUG_BY_DOMAIN = f"{DEBUG_DIR}\\url_cleanup_by_domain.txt" +DEBUG_POTENTIAL = f"{DEBUG_DIR}\\url_cleanup_potential.txt" + try: import stashapi.log as log from stashapi.stashapp import StashInterface @@ -20,11 +27,15 @@ 'x.com', 'twitter.com', 'onlyfans.com', - 'instagram.com', 'fansly.com', + 'xhamster.com', +} + +# Sites that should have www prefix added +ADD_WWW = { + 'instagram.com', 'pornhub.com', 'xvideos.com', - 'xhamster.com', } # Domain aliases - map old domains to canonical ones @@ -32,8 +43,66 @@ 'twitter.com': 'x.com', } -# Sites that preserve user's chosen capitalisation in the path -PRESERVE_CASE = {'x.com', 'twitter.com'} +# Sites where path is case-insensitive (safe to lowercase) +# Default behaviour: preserve original case +LOWERCASE_PATH = { + 'onlyfans.com', + 'instagram.com', +} + +# Sites that don't support HTTPS (keep as HTTP) +HTTP_ONLY = { + 'bustybuffy.com', + 'www.bustybuffy.com', +} + +# Path transformations - (domain, old_prefix, new_prefix) +PATH_TRANSFORMS = [ + ('eastcoasttalents.com', '/site/talent/', '/talent/'), +] + +# Path suffixes to remove - (domain, suffix) +REMOVE_PATH_SUFFIX = [ + ('fansly.com', '/posts'), +] + +# Sites that require trailing slashes +KEEP_TRAILING_SLASH = { + 'adultfilmdatabase.com', + 'www.adultfilmdatabase.com', +} + + +def get_known_domains(): + """Build set of all domains we have explicit rules for.""" + known = set() + known.update(REMOVE_WWW) + known.update(ADD_WWW) + known.update(DOMAIN_ALIASES.keys()) + known.update(LOWERCASE_PATH) + known.update(HTTP_ONLY) + known.update(KEEP_TRAILING_SLASH) + for domain, _, _ in PATH_TRANSFORMS: + known.add(domain) + for domain, _ in REMOVE_PATH_SUFFIX: + known.add(domain) + # Also add www variants + www_variants = {f'www.{d}' for d in known if not d.startswith('www.')} + known.update(www_variants) + return known + + +KNOWN_DOMAINS = get_known_domains() + + +def is_known_domain(domain): + """Check if domain has explicit rules configured.""" + d = domain.lower() + if d in KNOWN_DOMAINS: + return True + if d.startswith('www.') and d[4:] in KNOWN_DOMAINS: + return True + return False def normalise_url(url): @@ -41,21 +110,29 @@ def normalise_url(url): Returns (normalised_url, canonical_domain) tuple. """ + # Ensure URL has a scheme before parsing (urlparse needs it to identify netloc) + if not url.startswith(('http://', 'https://')): + url = 'https://' + url + # Parse the URL parsed = urlparse(url) - # Upgrade to HTTPS - scheme = 'https' - - # Normalise domain + # Normalise domain (need this early to check HTTP_ONLY) domain = parsed.netloc.lower() + # Upgrade to HTTPS unless site doesn't support it + scheme = 'http' if domain in HTTP_ONLY else 'https' + # Remove www if site doesn't use it if domain.startswith('www.'): domain_without_www = domain[4:] if domain_without_www in REMOVE_WWW: domain = domain_without_www + # Add www if site requires it + if not domain.startswith('www.') and domain in ADD_WWW: + domain = 'www.' + domain + # Apply domain aliases if domain in DOMAIN_ALIASES: domain = DOMAIN_ALIASES[domain] @@ -63,48 +140,161 @@ def normalise_url(url): # Handle path path = parsed.path - # Remove trailing slash - if path.endswith('/') and len(path) > 1: + # Apply path transformations + for transform_domain, old_prefix, new_prefix in PATH_TRANSFORMS: + if domain == transform_domain and path.startswith(old_prefix): + path = new_prefix + path[len(old_prefix):] + break + + # Remove path suffixes + for suffix_domain, suffix in REMOVE_PATH_SUFFIX: + if domain == suffix_domain and path.endswith(suffix): + path = path[:-len(suffix)] + break + + # Remove trailing slash (unless site requires it) + if path.endswith('/') and domain not in KEEP_TRAILING_SLASH: path = path.rstrip('/') - # Case handling - lowercase path unless site preserves case - if domain not in PRESERVE_CASE: + # Case handling - only lowercase if site is known to be case-insensitive + if domain in LOWERCASE_PATH: path = path.lower() - # Reconstruct URL - normalised = urlunparse((scheme, domain, path, '', '', '')) + # Reconstruct URL (preserve query string, drop fragment) + normalised = urlunparse((scheme, domain, path, parsed.params, parsed.query, '')) return normalised, domain +def write_debug_files(performers_to_update): + """Write debug output files for analysis.""" + # Per-performer output (confirmed changes only) + with open(DEBUG_BY_PERFORMER, 'w', encoding='utf-8') as f: + for p in performers_to_update: + if not p['changes']: + continue + f.write(f"{'=' * 60}\n") + f.write(f"Performer: {p['name']} (ID: {p['id']})\n") + f.write(f"{'=' * 60}\n") + f.write("Original URLs:\n") + for url in p['old_urls']: + f.write(f" {url}\n") + f.write("\nChanges:\n") + for change in p['changes']: + f.write(f" - {change}\n") + f.write("\nFinal URLs:\n") + for url in p['new_urls']: + f.write(f" {url}\n") + f.write("\n") + + # Per-domain output - group confirmed changes by domain + domain_changes = defaultdict(list) + for p in performers_to_update: + for url in p['old_urls']: + normalised, domain = normalise_url(url) + if normalised != url and is_known_domain(domain): + domain_changes[domain].append({ + 'performer': p['name'], + 'original': url, + 'normalised': normalised + }) + + with open(DEBUG_BY_DOMAIN, 'w', encoding='utf-8') as f: + for domain in sorted(domain_changes.keys()): + changes = domain_changes[domain] + f.write(f"{'=' * 60}\n") + f.write(f"Domain: {domain} ({len(changes)} changes)\n") + f.write(f"{'=' * 60}\n") + for c in changes: + f.write(f"[{c['performer']}]\n") + f.write(f" {c['original']}\n") + f.write(f" -> {c['normalised']}\n") + f.write("\n") + + # Potential changes - unknown domains grouped by domain + potential_by_domain = defaultdict(list) + for p in performers_to_update: + for url in p['old_urls']: + normalised, domain = normalise_url(url) + if normalised != url and not is_known_domain(domain): + potential_by_domain[domain].append({ + 'performer': p['name'], + 'original': url, + 'normalised': normalised + }) + + with open(DEBUG_POTENTIAL, 'w', encoding='utf-8') as f: + f.write("POTENTIAL CHANGES - Unknown domains (review and add rules as needed)\n") + f.write(f"{'=' * 60}\n\n") + for domain in sorted(potential_by_domain.keys()): + changes = potential_by_domain[domain] + f.write(f"{'=' * 60}\n") + f.write(f"Domain: {domain} ({len(changes)} potential changes)\n") + f.write(f"{'=' * 60}\n") + for c in changes: + f.write(f"[{c['performer']}]\n") + f.write(f" {c['original']}\n") + f.write(f" -> {c['normalised']}\n") + f.write("\n") + + +def has_mixed_case(url): + """Check if URL path has mixed case (likely from scraper, more accurate).""" + parsed = urlparse(url) + path = parsed.path + return path != path.lower() and path != path.upper() + + def deduplicate_and_sort(urls): """Normalise, deduplicate, and sort URLs. - Returns (new_urls, changes) where changes is a list of change descriptions. + Only applies changes to known domains. Unknown domain changes go to potential list. + Returns (new_urls, changes, potential_changes). """ if not urls: - return [], [] + return [], [], [] changes = [] - seen = {} # normalised_lower -> (normalised_url, original_url) + potential_changes = [] + seen = {} # normalised_lower -> (normalised_url, original_url, domain, is_known) for url in urls: normalised, domain = normalise_url(url) normalised_lower = normalised.lower() + known = is_known_domain(domain) if normalised_lower in seen: - # Duplicate found - existing_normalised, existing_original = seen[normalised_lower] - changes.append(f"Remove duplicate: {url} (same as {existing_original})") + # Duplicate found - prefer mixed case version (likely from scraper) + existing_normalised, existing_original, existing_domain, existing_known = seen[normalised_lower] + if has_mixed_case(normalised) and not has_mixed_case(existing_normalised): + seen[normalised_lower] = (normalised, url, domain, known) + msg = f"Remove duplicate: {existing_original} (prefer mixed-case {url})" + if known or existing_known: + changes.append(msg) + else: + potential_changes.append(msg) + else: + msg = f"Remove duplicate: {url} (same as {existing_original})" + if known or existing_known: + changes.append(msg) + else: + potential_changes.append(msg) else: - seen[normalised_lower] = (normalised, url) + seen[normalised_lower] = (normalised, url, domain, known) if normalised != url: - changes.append(f"Normalise: {url} -> {normalised}") + msg = f"Normalise: {url} -> {normalised}" + if known: + changes.append(msg) + else: + potential_changes.append(msg) - # Extract normalised URLs and sort by domain + # Build result - only apply normalisations for known domains result_urls = [] - for normalised, original in seen.values(): - result_urls.append(normalised) + for normalised, original, domain, known in seen.values(): + if known: + result_urls.append(normalised) + else: + result_urls.append(original) # Keep original for unknown domains # Sort by domain, then full URL def sort_key(url): @@ -117,7 +307,7 @@ def sort_key(url): if result_urls != sorted_urls: changes.append("Reordered URLs alphabetically by domain") - return sorted_urls, changes + return sorted_urls, changes, potential_changes def process_performers(stash, dry_run=True): @@ -146,15 +336,16 @@ def process_performers(stash, dry_run=True): if not urls: continue - new_urls, changes = deduplicate_and_sort(urls) + new_urls, changes, potential_changes = deduplicate_and_sort(urls) - if changes: + if changes or potential_changes: performers_to_update.append({ 'id': performer['id'], 'name': performer['name'], 'old_urls': urls, 'new_urls': new_urls, - 'changes': changes + 'changes': changes, + 'potential_changes': potential_changes, }) # Update progress @@ -166,28 +357,44 @@ def process_performers(stash, dry_run=True): log.info("No URL changes needed - all performers are already clean") return - log.info(f"\n{'=' * 60}") - log.info(f"Found {len(performers_to_update)} performers with URL changes:") - log.info(f"{'=' * 60}\n") + # Write debug files + write_debug_files(performers_to_update) + log.info(f"Debug files written to {DEBUG_DIR}") - for p in performers_to_update: - log.info(f"Performer: {p['name']} (ID: {p['id']})") - for change in p['changes']: - log.info(f" - {change}") - log.info(f" Final URLs:") - for url in p['new_urls']: - log.info(f" - {url}") - log.info("") + # Filter to only performers with confirmed changes + performers_with_changes = [p for p in performers_to_update if p['changes']] + performers_with_potential = [p for p in performers_to_update if p['potential_changes']] + + log.info(f"Found {len(performers_with_changes)} performers with confirmed changes") + log.info(f"Found {len(performers_with_potential)} performers with potential changes (see {DEBUG_POTENTIAL})") + + if performers_with_changes: + log.info(f"\n{'=' * 60}") + log.info(f"Confirmed changes:") + log.info(f"{'=' * 60}\n") + + for p in performers_with_changes: + log.info(f"Performer: {p['name']} (ID: {p['id']})") + for change in p['changes']: + log.info(f" - {change}") + log.info(f" Final URLs:") + for url in p['new_urls']: + log.info(f" - {url}") if dry_run: log.info(f"{'=' * 60}") log.info(f"PREVIEW MODE - No changes applied") - log.info(f"Run 'Apply URL Cleanup' to apply these changes") + if performers_with_changes: + log.info(f"Run 'Apply URL Cleanup' to apply {len(performers_with_changes)} confirmed changes") log.info(f"{'=' * 60}") else: - log.info(f"Applying changes to {len(performers_to_update)} performers...") + if not performers_with_changes: + log.info("No confirmed changes to apply") + return + + log.info(f"Applying changes to {len(performers_with_changes)} performers...") - for idx, p in enumerate(performers_to_update): + for idx, p in enumerate(performers_with_changes): try: stash.update_performer({ 'id': p['id'], @@ -197,10 +404,10 @@ def process_performers(stash, dry_run=True): except Exception as e: log.error(f"Failed to update {p['name']}: {e}") - log.progress((idx + 1) / len(performers_to_update)) + log.progress((idx + 1) / len(performers_with_changes)) log.info(f"{'=' * 60}") - log.info(f"Applied URL cleanup to {len(performers_to_update)} performers") + log.info(f"Applied URL cleanup to {len(performers_with_changes)} performers") log.info(f"{'=' * 60}") From fe5afd28582bd89155afc73ac8bfb2dc1f03be37 Mon Sep 17 00:00:00 2001 From: thismanyboyfriends2 Date: Tue, 3 Feb 2026 18:34:12 +0000 Subject: [PATCH 5/7] feat: toggle for output files --- .../performer-url-cleanup.yml | 5 ++ .../performer_url_cleanup.py | 76 ++++++++++++------- 2 files changed, 53 insertions(+), 28 deletions(-) diff --git a/plugins/performer-url-cleanup/performer-url-cleanup.yml b/plugins/performer-url-cleanup/performer-url-cleanup.yml index 40613db..cc3153c 100644 --- a/plugins/performer-url-cleanup/performer-url-cleanup.yml +++ b/plugins/performer-url-cleanup/performer-url-cleanup.yml @@ -6,6 +6,11 @@ exec: - python3 - "{pluginDir}/performer_url_cleanup.py" interface: raw +settings: + writeDebugFiles: + displayName: Write debug files + description: Output debug text files to the plugin directory for reviewing changes + type: BOOLEAN tasks: - name: Preview URL Cleanup description: Shows what URL changes would be made without applying them diff --git a/plugins/performer-url-cleanup/performer_url_cleanup.py b/plugins/performer-url-cleanup/performer_url_cleanup.py index 1ba82df..9b2a621 100644 --- a/plugins/performer-url-cleanup/performer_url_cleanup.py +++ b/plugins/performer-url-cleanup/performer_url_cleanup.py @@ -5,13 +5,19 @@ import json import sys from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed from urllib.parse import urlparse, urlunparse -# Debug output paths (temporary) -DEBUG_DIR = r"C:\stash" -DEBUG_BY_PERFORMER = f"{DEBUG_DIR}\\url_cleanup_by_performer.txt" -DEBUG_BY_DOMAIN = f"{DEBUG_DIR}\\url_cleanup_by_domain.txt" -DEBUG_POTENTIAL = f"{DEBUG_DIR}\\url_cleanup_potential.txt" +# Number of parallel threads for updates +PARALLEL_WORKERS = 10 + +import os + +# Debug output paths - written to plugin directory +PLUGIN_DIR = os.path.dirname(os.path.realpath(__file__)) +DEBUG_BY_PERFORMER = os.path.join(PLUGIN_DIR, "debug_by_performer.txt") +DEBUG_BY_DOMAIN = os.path.join(PLUGIN_DIR, "debug_by_domain.txt") +DEBUG_POTENTIAL = os.path.join(PLUGIN_DIR, "debug_potential.txt") try: import stashapi.log as log @@ -310,7 +316,7 @@ def sort_key(url): return sorted_urls, changes, potential_changes -def process_performers(stash, dry_run=True): +def process_performers(stash, dry_run=True, write_debug=False): """Process all performers and clean up their URLs.""" # Fetch all performers with URLs log.info("Fetching performers with URLs...") @@ -357,16 +363,17 @@ def process_performers(stash, dry_run=True): log.info("No URL changes needed - all performers are already clean") return - # Write debug files - write_debug_files(performers_to_update) - log.info(f"Debug files written to {DEBUG_DIR}") - # Filter to only performers with confirmed changes performers_with_changes = [p for p in performers_to_update if p['changes']] performers_with_potential = [p for p in performers_to_update if p['potential_changes']] log.info(f"Found {len(performers_with_changes)} performers with confirmed changes") - log.info(f"Found {len(performers_with_potential)} performers with potential changes (see {DEBUG_POTENTIAL})") + log.info(f"Found {len(performers_with_potential)} performers with potential changes") + + # Write debug files if enabled + if write_debug: + write_debug_files(performers_to_update) + log.info(f"Debug files written to {PLUGIN_DIR}") if performers_with_changes: log.info(f"\n{'=' * 60}") @@ -392,22 +399,32 @@ def process_performers(stash, dry_run=True): log.info("No confirmed changes to apply") return - log.info(f"Applying changes to {len(performers_with_changes)} performers...") - - for idx, p in enumerate(performers_with_changes): - try: - stash.update_performer({ - 'id': p['id'], - 'urls': p['new_urls'] - }) - log.debug(f"Updated {p['name']}") - except Exception as e: - log.error(f"Failed to update {p['name']}: {e}") - - log.progress((idx + 1) / len(performers_with_changes)) + log.info(f"Applying changes to {len(performers_with_changes)} performers using {PARALLEL_WORKERS} workers...") + + completed = 0 + failed = 0 + total = len(performers_with_changes) + + def update_performer(p): + stash.update_performer({'id': p['id'], 'urls': p['new_urls']}) + return p['name'] + + with ThreadPoolExecutor(max_workers=PARALLEL_WORKERS) as executor: + futures = {executor.submit(update_performer, p): p for p in performers_with_changes} + for future in as_completed(futures): + p = futures[future] + try: + future.result() + completed += 1 + if completed % 100 == 0 or completed == total: + log.info(f"Progress: {completed}/{total} performers updated") + except Exception as e: + log.error(f"Failed to update {p['name']}: {e}") + failed += 1 + log.progress((completed + failed) / total) log.info(f"{'=' * 60}") - log.info(f"Applied URL cleanup to {len(performers_with_changes)} performers") + log.info(f"Applied URL cleanup to {completed} performers ({failed} failed)") log.info(f"{'=' * 60}") @@ -423,13 +440,16 @@ def main(): # Get mode from args mode = json_input.get("args", {}).get("mode", "preview") + # Get settings + write_debug = json_input.get("server_connection", {}).get("PluginDir") and \ + stash.get_configuration().get("plugins", {}).get("performer-url-cleanup", {}).get("writeDebugFiles", False) + log.info(f"Performer URL Cleanup - Mode: {mode}") - log.info("") if mode == "preview": - process_performers(stash, dry_run=True) + process_performers(stash, dry_run=True, write_debug=write_debug) elif mode == "apply": - process_performers(stash, dry_run=False) + process_performers(stash, dry_run=False, write_debug=write_debug) else: log.error(f"Unknown mode: {mode}") From a956cd58032daf407210c781d5dc23efe8f120f9 Mon Sep 17 00:00:00 2001 From: thismanyboyfriends2 Date: Tue, 3 Feb 2026 18:39:42 +0000 Subject: [PATCH 6/7] ci: adding in claude review PR step --- .github/workflows/claude-review.yml | 41 +++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 .github/workflows/claude-review.yml diff --git a/.github/workflows/claude-review.yml b/.github/workflows/claude-review.yml new file mode 100644 index 0000000..fcfdf36 --- /dev/null +++ b/.github/workflows/claude-review.yml @@ -0,0 +1,41 @@ +name: Claude PR Review + +on: + pull_request: + types: [opened, synchronize] + branches: + - main + issue_comment: + types: [created] + pull_request_review_comment: + types: [created] + +jobs: + review: + runs-on: ubuntu-latest + # Only run on PRs targeting main, or on @claude mentions + if: | + (github.event_name == 'pull_request') || + (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) + permissions: + contents: read + pull-requests: write + issues: write + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: anthropics/claude-code-action@v1 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + prompt: | + Review this PR for: + 1. Code quality and best practices + 2. Potential bugs or edge cases + 3. Security concerns + 4. Alignment with Stash plugin conventions (YAML metadata, Python/JS patterns) + + Be concise and actionable. Focus on substantive issues rather than style nitpicks. From 56b1e14daaf5861254807c6110c6246461b8e1c9 Mon Sep 17 00:00:00 2001 From: thismanyboyfriends2 Date: Tue, 3 Feb 2026 18:52:25 +0000 Subject: [PATCH 7/7] ci: clamping down permissions --- .github/workflows/claude-review.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/claude-review.yml b/.github/workflows/claude-review.yml index fcfdf36..9c80a9a 100644 --- a/.github/workflows/claude-review.yml +++ b/.github/workflows/claude-review.yml @@ -13,11 +13,13 @@ on: jobs: review: runs-on: ubuntu-latest - # Only run on PRs targeting main, or on @claude mentions + # Only run for repo owner if: | - (github.event_name == 'pull_request') || - (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || - (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) + (github.event_name == 'pull_request' && github.event.pull_request.author_association == 'OWNER') || + (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude') && + github.event.comment.author_association == 'OWNER') || + (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude') && + github.event.comment.author_association == 'OWNER') permissions: contents: read pull-requests: write