From 5250708e3d824cb5307086dae83b2557d306fb05 Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Fri, 23 Jan 2026 15:45:58 +0000 Subject: [PATCH 01/17] Add PAGE_PRINTING_KAMP to PRINTING_PAGES list --- display.py | 1 + 1 file changed, 1 insertion(+) diff --git a/display.py b/display.py index def1ae3..cc2b193 100644 --- a/display.py +++ b/display.py @@ -105,6 +105,7 @@ def signal_handler(signum, frame): PRINTING_PAGES = [ PAGE_PRINTING, + PAGE_PRINTING_KAMP, PAGE_PRINTING_FILAMENT, PAGE_PRINTING_PAUSE, PAGE_PRINTING_STOP, From 5c7ceb67c17892b19cf7930304e2fd75d064bfc3 Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Fri, 23 Jan 2026 16:23:06 +0000 Subject: [PATCH 02/17] Implement rapid scan mode for bed leveling Added rapid scan mode functionality to improve bed leveling process. --- display.py | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/display.py b/display.py index cc2b193..3ff695a 100644 --- a/display.py +++ b/display.py @@ -265,6 +265,7 @@ def __init__(self, config, loop): self.bed_leveling_counts = [0, 0] self.bed_leveling_probed_count = 0 self.bed_leveling_last_position = None + self._rapid_scan_mode = False self.klipper_restart_event = asyncio.Event() @@ -2055,11 +2056,27 @@ async def handle_gcode_response(self, response): self.bed_leveling_counts = [x_count, y_count] elif response.startswith("// Adapted mesh bounds"): self.bed_leveling_probed_count = 0 - current_page = await self._get_current_page() + self.bed_leveling_last_position = None + self._rapid_scan_mode = False + self._bed_leveling_complete = False + # Reset leveling_mode for KAMP during printing (not manual full bed level) + if self.current_state in ("printing", "paused"): + self.leveling_mode = None + current_page = await self._get_current_page() if current_page != PAGE_PRINTING_KAMP: self._loop.create_task(self._navigate_to_page(PAGE_PRINTING_KAMP, clear_history=True)) + elif "Beginning rapid surface scan" in response: + # Rapid scan mode (Eddy/Cartographer) - skip per-point visualization + self._rapid_scan_mode = True + logger.info("Rapid scan mode detected - using simplified visualization") + self._loop.create_task( + self.display.update_kamp_text("Scanning bed surface...") + ) elif response.startswith("// probe at"): - current_page = await self._get_current_page() + # Skip all probe messages during rapid scan to avoid overwhelming the system + if self._rapid_scan_mode: + return + current_page = await self._get_current_page() if current_page != PAGE_PRINTING_KAMP: # We are not leveling, likely response came from manual probe e.g. from console, # Skip updating the state, otherwise it messes up bed leveling screen when printing @@ -2090,12 +2107,26 @@ async def handle_gcode_response(self, response): ) elif response.startswith("// Mesh Bed Leveling Complete"): + # If rapid scan mode was active, draw all boxes green now + if self._rapid_scan_mode and self.bed_leveling_counts[0] > 0: + total_probes = self.bed_leveling_counts[0] * self.bed_leveling_counts[1] + for i in range(total_probes): + self._loop.create_task( + self.display.draw_kamp_box_index(i, BACKGROUND_SUCCESS, self.bed_leveling_counts) + ) + self._loop.create_task( + self.display.update_kamp_text(f"Complete ({total_probes}/{total_probes})") + ) + self.bed_leveling_probed_count = 0 self.bed_leveling_counts = self.full_bed_leveling_counts - current_page = await self._get_current_page() + self._rapid_scan_mode = False + + current_page = await self._get_current_page() if current_page == PAGE_PRINTING_KAMP: self._bed_leveling_complete = True - if self.leveling_mode == "full_bed": + # Only show bed mesh final for manual full bed leveling (not during printing) + if self.leveling_mode == "full_bed" and self.current_state not in ("printing", "paused"): self._loop.create_task(self.display.show_bed_mesh_final()) else: self._loop.create_task(self._handle_bed_leveling_complete()) From 497391fc6decd6f4e9155b97aac8c1609c6f3f83 Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Fri, 23 Jan 2026 16:42:21 +0000 Subject: [PATCH 03/17] Refactor rapid scan mode handling in display.py --- display.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/display.py b/display.py index 3ff695a..3653aaf 100644 --- a/display.py +++ b/display.py @@ -2066,9 +2066,19 @@ async def handle_gcode_response(self, response): if current_page != PAGE_PRINTING_KAMP: self._loop.create_task(self._navigate_to_page(PAGE_PRINTING_KAMP, clear_history=True)) elif "Beginning rapid surface scan" in response: - # Rapid scan mode (Eddy/Cartographer) - skip per-point visualization + # Rapid scan mode (Eddy/Cartographer/Beacon) - these probes don't send + # "Adapted mesh bounds" or "probe at" messages, so we handle everything here self._rapid_scan_mode = True + self.bed_leveling_probed_count = 0 + self.bed_leveling_last_position = None + self._bed_leveling_complete = False + # Reset leveling_mode for KAMP during printing + if self.current_state in ("printing", "paused"): + self.leveling_mode = None logger.info("Rapid scan mode detected - using simplified visualization") + current_page = await self._get_current_page() + if current_page != PAGE_PRINTING_KAMP: + self._loop.create_task(self._navigate_to_page(PAGE_PRINTING_KAMP, clear_history=True)) self._loop.create_task( self.display.update_kamp_text("Scanning bed surface...") ) @@ -2107,15 +2117,11 @@ async def handle_gcode_response(self, response): ) elif response.startswith("// Mesh Bed Leveling Complete"): - # If rapid scan mode was active, draw all boxes green now - if self._rapid_scan_mode and self.bed_leveling_counts[0] > 0: - total_probes = self.bed_leveling_counts[0] * self.bed_leveling_counts[1] - for i in range(total_probes): - self._loop.create_task( - self.display.draw_kamp_box_index(i, BACKGROUND_SUCCESS, self.bed_leveling_counts) - ) + # If rapid scan mode was active, show completion (no boxes for rapid scan probes + # since they don't report probe counts) + if self._rapid_scan_mode: self._loop.create_task( - self.display.update_kamp_text(f"Complete ({total_probes}/{total_probes})") + self.display.update_kamp_text("Scan complete!") ) self.bed_leveling_probed_count = 0 From e93846982f6007dbf25b17f782c3c05f20a09517 Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Fri, 23 Jan 2026 16:45:17 +0000 Subject: [PATCH 04/17] Refactor bed leveling completion handling --- display.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/display.py b/display.py index 3653aaf..787f78f 100644 --- a/display.py +++ b/display.py @@ -2057,8 +2057,9 @@ async def handle_gcode_response(self, response): elif response.startswith("// Adapted mesh bounds"): self.bed_leveling_probed_count = 0 self.bed_leveling_last_position = None - self._rapid_scan_mode = False self._bed_leveling_complete = False + # Don't reset _rapid_scan_mode here - it may have been set by + # "Beginning rapid surface scan" which can come before or after this message # Reset leveling_mode for KAMP during printing (not manual full bed level) if self.current_state in ("printing", "paused"): self.leveling_mode = None @@ -2117,12 +2118,24 @@ async def handle_gcode_response(self, response): ) elif response.startswith("// Mesh Bed Leveling Complete"): - # If rapid scan mode was active, show completion (no boxes for rapid scan probes - # since they don't report probe counts) + # If rapid scan mode was active, show completion + # Draw boxes if we received probe counts (some probes send both rapid scan AND counts) if self._rapid_scan_mode: - self._loop.create_task( - self.display.update_kamp_text("Scan complete!") - ) + if self.bed_leveling_counts[0] > 0: + # We have counts, draw all boxes green + total_probes = self.bed_leveling_counts[0] * self.bed_leveling_counts[1] + for i in range(total_probes): + self._loop.create_task( + self.display.draw_kamp_box_index(i, BACKGROUND_SUCCESS, self.bed_leveling_counts) + ) + self._loop.create_task( + self.display.update_kamp_text(f"Scan complete! ({total_probes} points)") + ) + else: + # No counts available, just show completion text + self._loop.create_task( + self.display.update_kamp_text("Scan complete!") + ) self.bed_leveling_probed_count = 0 self.bed_leveling_counts = self.full_bed_leveling_counts From e41eeda4a954fd5d41043a61796447e06ae0de78 Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Fri, 23 Jan 2026 17:55:36 +0000 Subject: [PATCH 05/17] Enhance rapid scan mode detection in display.py --- display.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/display.py b/display.py index 787f78f..7ac69fe 100644 --- a/display.py +++ b/display.py @@ -2066,7 +2066,7 @@ async def handle_gcode_response(self, response): current_page = await self._get_current_page() if current_page != PAGE_PRINTING_KAMP: self._loop.create_task(self._navigate_to_page(PAGE_PRINTING_KAMP, clear_history=True)) - elif "Beginning rapid surface scan" in response: + elif "Beginning rapid surface scan" in response or "[cartographer] Starting stream" in response: # Rapid scan mode (Eddy/Cartographer/Beacon) - these probes don't send # "Adapted mesh bounds" or "probe at" messages, so we handle everything here self._rapid_scan_mode = True From 6608eaa7fa6edd199a77517212826cb25063f8cd Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Fri, 23 Jan 2026 18:05:56 +0000 Subject: [PATCH 06/17] Enhance response check for bed leveling completion --- display.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/display.py b/display.py index 7ac69fe..01a17f0 100644 --- a/display.py +++ b/display.py @@ -2117,7 +2117,7 @@ async def handle_gcode_response(self, response): ) ) - elif response.startswith("// Mesh Bed Leveling Complete"): + elif response.startswith("// Mesh Bed Leveling Complete") or "[cartographer] Collecting samples along the scanning path completed" in response: # If rapid scan mode was active, show completion # Draw boxes if we received probe counts (some probes send both rapid scan AND counts) if self._rapid_scan_mode: From b56d0b513480a07beee86dadd484f3d708f38845 Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Tue, 27 Jan 2026 11:06:56 +0000 Subject: [PATCH 07/17] Fix G-code response handling for probe messages --- display.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/display.py b/display.py index 01a17f0..531a3f5 100644 --- a/display.py +++ b/display.py @@ -2028,7 +2028,7 @@ async def handle_zprobe_leveling(self): async def handle_gcode_response(self, response): if self.leveling_mode == "screw": - if "probe at" in response: + if "probe: at" in response: self.screw_probe_count += 1 self._loop.create_task( self.display.update_screw_level_description( @@ -2068,7 +2068,7 @@ async def handle_gcode_response(self, response): self._loop.create_task(self._navigate_to_page(PAGE_PRINTING_KAMP, clear_history=True)) elif "Beginning rapid surface scan" in response or "[cartographer] Starting stream" in response: # Rapid scan mode (Eddy/Cartographer/Beacon) - these probes don't send - # "Adapted mesh bounds" or "probe at" messages, so we handle everything here + # "Adapted mesh bounds" or "probe: at" messages, so we handle everything here self._rapid_scan_mode = True self.bed_leveling_probed_count = 0 self.bed_leveling_last_position = None @@ -2083,7 +2083,7 @@ async def handle_gcode_response(self, response): self._loop.create_task( self.display.update_kamp_text("Scanning bed surface...") ) - elif response.startswith("// probe at"): + elif response.startswith("// probe: at"): # Skip all probe messages during rapid scan to avoid overwhelming the system if self._rapid_scan_mode: return From 28b0070312cc93c68043e42026553b540d0bf585 Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Tue, 27 Jan 2026 11:15:50 +0000 Subject: [PATCH 08/17] Enhance affinity setup with multiprocessing detection Added auto-detection for multiprocessing plugins and enhanced logging for IRQs and multiprocessing support. --- affinity-setup.sh | 173 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 148 insertions(+), 25 deletions(-) diff --git a/affinity-setup.sh b/affinity-setup.sh index b953db4..f271a0f 100644 --- a/affinity-setup.sh +++ b/affinity-setup.sh @@ -1,5 +1,7 @@ #!/bin/sh # Klipper stack affinity + PREEMPT_RT realtime setup (no unit file edits on disk) +# Auto-detects multiprocessing plugins (Cartographer, Beacon, IDM) and handles +# child process migration + continuous thread pinning. set -eu @@ -20,12 +22,49 @@ DISPLAY_TTY_CPU=1 # display.service + ttyS1 IRQ KLIPPER_MCU_RPI_CPU=2 # klipper-mcu.service (host-MCU tasks) KLIPPER_MCU_TTY_CPU=3 # klipper.service + ttyS0 IRQ +KLIPPER_CPUS_WIDE="0,3" # widened cgroup for child migration + +# --- auto-detection of multiprocessing plugins ------------------------------- +# Scans klippy logs for evidence of loaded plugins that use multiprocessing. +# More reliable than config parsing: catches includes, conditional configs, +# and confirms the plugin actually loaded successfully. +detect_multiprocessing_plugins() { + log_paths=" + /home/*/printer_data/logs/klippy.log* + /home/*/klipper_logs/klippy.log* + " + + # Patterns indicating multiprocessing plugins loaded: + # [cartographer] - Cartographer probe (eddy current) + # [scanner] - Cartographer v5+ / Survey Touch + # [beacon] - Beacon probe (eddy current) + # [mcu eddy] - BTT Eddy MCU definition (space distinguishes from other mcus) + # btt_eddy - BTT Eddy references in config/loading + # [idm] - IDM probe + pattern='\[cartographer\]|\[scanner\]|\[beacon\]|\[mcu eddy\]|btt_eddy|\[idm\]' + + # shellcheck disable=SC2086 + for glob in $log_paths; do + for f in $glob; do + [ -f "$f" ] || continue + # Search first 5000 lines (startup/config loading section) to avoid + # scanning entire multi-MB logs; plugins appear early during init + if head -n 5000 "$f" 2>/dev/null | grep -qE "$pattern"; then + log "Detected multiprocessing plugin in $f" + return 0 + fi + done + done + + return 1 +} + # --- basic env checks (warn-only) -------------------------------------------- have() { command -v "$1" >/dev/null 2>&1; } if systemctl is-active --quiet irqbalance.service 2>/dev/null; then log "WARN: irqbalance is active; it may override IRQ affinities." fi -for bin in systemctl awk ps sed chrt taskset ionice renice stty sysctl logger; do +for bin in systemctl awk ps sed chrt taskset ionice renice stty sysctl logger grep; do have "$bin" || log "WARN: missing helper '$bin' (some steps may be skipped)." done @@ -37,7 +76,7 @@ cpu_online() { } # --- helpers ----------------------------------------------------------------- -wait_active() { # wait until systemd unit is active and has a MainPID +wait_active() { unit="$1"; t=0 while [ "$t" -lt 30 ]; do state=$(systemctl is-active "$unit" 2>/dev/null || true) @@ -52,7 +91,7 @@ wait_active() { # wait until systemd unit is active and has a MainPID return 0 } -wait_irq_present() { # wait until /proc/interrupts shows the device name +wait_irq_present() { dev="$1"; t=0 while [ "$t" -lt 20 ]; do irq=$(awk -v n="$dev" '$NF==n{gsub(":", "", $1); print $1; exit}' /proc/interrupts) @@ -78,7 +117,6 @@ pin_irq() { if [ -w "$p/smp_affinity_list" ]; then echo "$cpu" > "$p/smp_affinity_list" 2>/dev/null || true else - # list path preferred; mask unsafe for cpu>=32 printf '%x\n' "$((1< "$p/smp_affinity" 2>/dev/null || true fi log "Pinned IRQ $irq to CPU $cpu" @@ -86,15 +124,21 @@ pin_irq() { set_unit_cpus() { unit="$1"; cpus="$2" - cpu_online "$cpus" || { log "WARN: CPU $cpus not online; skip $unit CPU pin"; return 0; } + case "$cpus" in + *,*|*-*) ;; + *) cpu_online "$cpus" || { log "WARN: CPU $cpus not online; skip $unit CPU pin"; return 0; } ;; + esac if systemctl set-property --runtime "$unit" "AllowedCPUs=$cpus" >/dev/null 2>&1; then log "AllowedCPUs for $unit -> $cpus" + return 0 else pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) if [ "${pid:-0}" -gt 0 ] && taskset -pc "$cpus" "$pid" >/dev/null 2>&1; then log "taskset fallback for $unit(pid=$pid) -> $cpus" + return 0 fi fi + return 1 } renice_unit() { @@ -120,7 +164,6 @@ ps_line() { ps -o pid,cls,rtprio,psr,cmd -p "$pid" --no-headers 2>/dev/null | sed 's/^/ /' } -# Ensure a unit runs as SCHED_FIFO:prio even if the process set its own (-r/49) promote_unit_fifo() { unit="$1"; prio="$2" systemctl set-property --runtime "$unit" CPUSchedulingPolicy=fifo CPUSchedulingPriority="$prio" >/dev/null 2>&1 || true @@ -142,8 +185,6 @@ promote_unit_fifo() { return 0 } -# Promote a threaded IRQ kernel thread (irq/-*) to FIFO priority -# Handles kernel threads displayed as "[irq/-...]" by stripping brackets. chrt_irq_thread() { irq="$1"; prio="$2"; t=0 [ -n "$irq" ] || return 0 @@ -168,11 +209,99 @@ chrt_irq_thread() { return 0 } +# --- multiprocessing support ------------------------------------------------- +# Widens klipper cgroup AFTER RT promotion, immediately pins all threads, +# then starts monitor for ongoing thread/child management. + +widen_and_monitor_klipper() { + main_pid=$(systemctl show -p MainPID --value klipper.service 2>/dev/null || echo 0) + [ "${main_pid:-0}" -gt 0 ] || return 0 + + children_file="/proc/$main_pid/task/$main_pid/children" + [ -f "$children_file" ] || { + log "WARN: /proc children file unavailable (CONFIG_PROC_CHILDREN=n?); child monitor disabled" + # Continue anyway - thread monitoring still useful + } + + # Widen cgroup to allow children on CPU 0 + if ! systemctl set-property --runtime klipper.service "AllowedCPUs=$KLIPPER_CPUS_WIDE" >/dev/null 2>&1; then + log "WARN: could not widen klipper cgroup; multiprocessing children will run on CPU $KLIPPER_MCU_TTY_CPU" + return 0 + fi + + # Immediately pin all existing threads to CPU 3 (no race window) + taskset -apc "$KLIPPER_MCU_TTY_CPU" "$main_pid" >/dev/null 2>&1 || true + log "Widened klipper cgroup to $KLIPPER_CPUS_WIDE, pinned existing threads to CPU $KLIPPER_MCU_TTY_CPU" + + # Start combined thread + child monitor + log "Starting klipper thread/child monitor for pid $main_pid" + + ( + seen_children="" + target_mask=$(printf '%x' $((1 << KLIPPER_MCU_TTY_CPU))) + + while kill -0 "$main_pid" 2>/dev/null; do + # --- Pin any new/migrated threads in main process to CPU 3 --- + for tid_path in /proc/"$main_pid"/task/*; do + tid="${tid_path##*/}" + [ -d "$tid_path" ] || continue + + # Check current affinity - only touch if not already correct + current=$(cat "$tid_path/status" 2>/dev/null | awk '/^Cpus_allowed:/{print $2}' || echo "") + if [ -n "$current" ] && [ "$current" != "$target_mask" ]; then + taskset -pc "$KLIPPER_MCU_TTY_CPU" "$tid" >/dev/null 2>&1 || true + fi + done + + # --- Migrate child processes to CPU 0 --- + if [ -f "$children_file" ]; then + for cpid in $(cat "$children_file" 2>/dev/null); do + case " $seen_children " in + *" $cpid "*) continue ;; + esac + + if [ -d "/proc/$cpid" ]; then + tgid=$(awk '/^Tgid:/{print $2}' "/proc/$cpid/status" 2>/dev/null || echo "") + if [ "$tgid" = "$cpid" ]; then + taskset -apc "$MISC_CPU" "$cpid" >/dev/null 2>&1 && \ + logger -t "$TAG" "klipper child $cpid -> CPU $MISC_CPU" + renice -n 15 -p "$cpid" >/dev/null 2>&1 || true + ionice -c3 -p "$cpid" >/dev/null 2>&1 || true + seen_children="$seen_children $cpid" + fi + fi + done + fi + + # 250ms poll: balance between responsiveness and overhead + sleep 0.25 + done + + logger -t "$TAG" "klipper monitor exiting (main process gone)" + ) & + + monitor_pid=$! + taskset -pc "$MISC_CPU" "$monitor_pid" >/dev/null 2>&1 || true + renice -n 19 -p "$monitor_pid" >/dev/null 2>&1 || true + ionice -c3 -p "$monitor_pid" >/dev/null 2>&1 || true + + log "Thread/child monitor running as pid $monitor_pid on CPU $MISC_CPU" +} + # --- wait for services ------------------------------------------------------- for u in klipper.service klipper-mcu.service moonraker.service display.service; do wait_active "$u" done +# --- auto-detect multiprocessing plugins ------------------------------------- +MULTIPROCESSING_DETECTED=0 +if detect_multiprocessing_plugins; then + MULTIPROCESSING_DETECTED=1 + log "Multiprocessing plugin support enabled" +else + log "No multiprocessing plugins detected; using standard pinning" +fi + # --- wait for UART IRQs ------------------------------------------------------ wait_irq_present ttyS0 wait_irq_present ttyS1 @@ -187,7 +316,7 @@ IRQ_S2="$(irq_for ttyS2 || true)" [ -n "${IRQ_S1:-}" ] && pin_irq "$IRQ_S1" "$DISPLAY_TTY_CPU" [ -n "${IRQ_S2:-}" ] && pin_irq "$IRQ_S2" "$MISC_CPU" -# --- place units on CPUs ----------------------------------------------------- +# --- place units on CPUs (klipper gets single CPU initially) ----------------- set_unit_cpus klipper-mcu.service "$KLIPPER_MCU_RPI_CPU" set_unit_cpus klipper.service "$KLIPPER_MCU_TTY_CPU" set_unit_cpus display.service "$DISPLAY_TTY_CPU" @@ -219,6 +348,12 @@ sysctl -w kernel.sched_rt_runtime_us=-1 >/dev/null 2>&1 || \ promote_unit_fifo klipper-mcu.service 60 promote_unit_fifo klipper.service 60 +# --- multiprocessing support (only if plugin detected) ----------------------- +# Done AFTER RT promotion to avoid race window +if [ "$MULTIPROCESSING_DETECTED" = "1" ]; then + widen_and_monitor_klipper +fi + # --- bump ttyS0 IRQ thread if present ---------------------------------------- if [ -n "${IRQ_S0:-}" ]; then chrt_irq_thread "$IRQ_S0" 70 @@ -226,31 +361,19 @@ fi # --- summary ----------------------------------------------------------------- if [ -n "${IRQ_S0:-}" ]; then - aff0=$( - cat "/proc/irq/$IRQ_S0/smp_affinity_list" 2>/dev/null || - cat "/proc/irq/$IRQ_S0/smp_affinity" 2>/dev/null || - echo "?" - ) + aff0=$(cat "/proc/irq/$IRQ_S0/smp_affinity_list" 2>/dev/null || echo "?") log "ttyS0 irq=$IRQ_S0 aff=$aff0" fi if [ -n "${IRQ_S1:-}" ]; then - aff1=$( - cat "/proc/irq/$IRQ_S1/smp_affinity_list" 2>/dev/null || - cat "/proc/irq/$IRQ_S1/smp_affinity" 2>/dev/null || - echo "?" - ) + aff1=$(cat "/proc/irq/$IRQ_S1/smp_affinity_list" 2>/dev/null || echo "?") log "ttyS1 irq=$IRQ_S1 aff=$aff1" fi if [ -n "${IRQ_S2:-}" ]; then - aff2=$( - cat "/proc/irq/$IRQ_S2/smp_affinity_list" 2>/dev/null || - cat "/proc/irq/$IRQ_S2/smp_affinity" 2>/dev/null || - echo "?" - ) + aff2=$(cat "/proc/irq/$IRQ_S2/smp_affinity_list" 2>/dev/null || echo "?") log "ttyS2 irq=$IRQ_S2 aff=$aff2" fi log "klipper-mcu:$(ps_line "$(systemctl show -p MainPID --value klipper-mcu.service 2>/dev/null || echo 0)" || true)" log "klipper: $(ps_line "$(systemctl show -p MainPID --value klipper.service 2>/dev/null || echo 0)" || true)" -log "done (IRQs: ttyS0:${IRQ_S0:-?} ttyS1:${IRQ_S1:-?} ttyS2:${IRQ_S2:-?})" +log "done (IRQs: ttyS0:${IRQ_S0:-?} ttyS1:${IRQ_S1:-?} ttyS2:${IRQ_S2:-?}, multiprocessing=$MULTIPROCESSING_DETECTED)" exit 0 From 722542a53743fecf7c2d28702788e35670dd544f Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:26:28 +0000 Subject: [PATCH 09/17] Refactor affinity-setup.sh for improved performance Enhance affinity setup script with improved error handling, faster polling, and better CPU validation. Update child process management and logging for multiprocessing plugins. --- affinity-setup.sh | 254 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 177 insertions(+), 77 deletions(-) diff --git a/affinity-setup.sh b/affinity-setup.sh index f271a0f..36cc68e 100644 --- a/affinity-setup.sh +++ b/affinity-setup.sh @@ -2,6 +2,13 @@ # Klipper stack affinity + PREEMPT_RT realtime setup (no unit file edits on disk) # Auto-detects multiprocessing plugins (Cartographer, Beacon, IDM) and handles # child process migration + continuous thread pinning. +# +# Fixes applied: +# - Child processes are demoted from inherited SCHED_FIFO to SCHED_OTHER +# - Cpus_allowed comparison handles leading zeros correctly +# - Faster 100ms polling for quicker child detection +# - Reduced thread re-pinning overhead (only on affinity drift) +# - Proper error handling throughout set -eu @@ -24,47 +31,41 @@ KLIPPER_MCU_TTY_CPU=3 # klipper.service + ttyS0 IRQ KLIPPER_CPUS_WIDE="0,3" # widened cgroup for child migration +# Monitor polling interval in seconds (100ms for responsive child detection) +MONITOR_POLL_INTERVAL="0.1" + # --- auto-detection of multiprocessing plugins ------------------------------- -# Scans klippy logs for evidence of loaded plugins that use multiprocessing. -# More reliable than config parsing: catches includes, conditional configs, -# and confirms the plugin actually loaded successfully. detect_multiprocessing_plugins() { log_paths=" /home/*/printer_data/logs/klippy.log* /home/*/klipper_logs/klippy.log* " - - # Patterns indicating multiprocessing plugins loaded: - # [cartographer] - Cartographer probe (eddy current) - # [scanner] - Cartographer v5+ / Survey Touch - # [beacon] - Beacon probe (eddy current) - # [mcu eddy] - BTT Eddy MCU definition (space distinguishes from other mcus) - # btt_eddy - BTT Eddy references in config/loading - # [idm] - IDM probe + + # Patterns indicating multiprocessing plugins loaded pattern='\[cartographer\]|\[scanner\]|\[beacon\]|\[mcu eddy\]|btt_eddy|\[idm\]' - + # shellcheck disable=SC2086 for glob in $log_paths; do for f in $glob; do [ -f "$f" ] || continue - # Search first 5000 lines (startup/config loading section) to avoid - # scanning entire multi-MB logs; plugins appear early during init if head -n 5000 "$f" 2>/dev/null | grep -qE "$pattern"; then log "Detected multiprocessing plugin in $f" return 0 fi done done - + return 1 } # --- basic env checks (warn-only) -------------------------------------------- have() { command -v "$1" >/dev/null 2>&1; } + if systemctl is-active --quiet irqbalance.service 2>/dev/null; then log "WARN: irqbalance is active; it may override IRQ affinities." fi -for bin in systemctl awk ps sed chrt taskset ionice renice stty sysctl logger grep; do + +for bin in systemctl awk ps sed chrt taskset ionice renice stty sysctl logger grep pgrep; do have "$bin" || log "WARN: missing helper '$bin' (some steps may be skipped)." done @@ -75,6 +76,25 @@ cpu_online() { [ ! -f "$onf" ] || [ "$(cat "$onf" 2>/dev/null || echo 1)" = "1" ] } +# Validate CPU list (handles single CPU, ranges, and lists) +validate_cpu_list() { + cpus="$1" + case "$cpus" in + *,*) + # Comma-separated list: validate each part + for part in $(echo "$cpus" | tr ',' ' '); do + case "$part" in + *-*) ;; # Range - assume valid + *) cpu_online "$part" || return 1 ;; + esac + done + ;; + *-*) ;; # Range - assume valid + *) cpu_online "$cpus" || return 1 ;; + esac + return 0 +} + # --- helpers ----------------------------------------------------------------- wait_active() { unit="$1"; t=0 @@ -124,20 +144,22 @@ pin_irq() { set_unit_cpus() { unit="$1"; cpus="$2" - case "$cpus" in - *,*|*-*) ;; - *) cpu_online "$cpus" || { log "WARN: CPU $cpus not online; skip $unit CPU pin"; return 0; } ;; - esac + + validate_cpu_list "$cpus" || { log "WARN: CPU(s) $cpus not valid; skip $unit CPU pin"; return 0; } + if systemctl set-property --runtime "$unit" "AllowedCPUs=$cpus" >/dev/null 2>&1; then log "AllowedCPUs for $unit -> $cpus" return 0 - else - pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) - if [ "${pid:-0}" -gt 0 ] && taskset -pc "$cpus" "$pid" >/dev/null 2>&1; then - log "taskset fallback for $unit(pid=$pid) -> $cpus" - return 0 - fi fi + + # Fallback to taskset + pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) + if [ "${pid:-0}" -gt 0 ] && taskset -pc "$cpus" "$pid" >/dev/null 2>&1; then + log "taskset fallback for $unit(pid=$pid) -> $cpus" + return 0 + fi + + log "WARN: could not set CPUs for $unit" return 1 } @@ -161,7 +183,8 @@ ionice_idle_unit() { ps_line() { pid="$1" - ps -o pid,cls,rtprio,psr,cmd -p "$pid" --no-headers 2>/dev/null | sed 's/^/ /' + [ "${pid:-0}" -gt 0 ] || return 0 + ps -o pid,cls,rtprio,psr,cmd -p "$pid" --no-headers 2>/dev/null | sed 's/^/ /' || true } promote_unit_fifo() { @@ -169,6 +192,7 @@ promote_unit_fifo() { systemctl set-property --runtime "$unit" CPUSchedulingPolicy=fifo CPUSchedulingPriority="$prio" >/dev/null 2>&1 || true pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) [ "${pid:-0}" -gt 0 ] || return 0 + n=0 while [ "$n" -lt 10 ]; do chrt -a -f -p "$prio" "$pid" >/dev/null 2>&1 || true @@ -210,81 +234,157 @@ chrt_irq_thread() { } # --- multiprocessing support ------------------------------------------------- -# Widens klipper cgroup AFTER RT promotion, immediately pins all threads, -# then starts monitor for ongoing thread/child management. +MONITOR_PID_FILE="/run/klipper-affinity-monitor.pid" + +kill_existing_monitor() { + if [ -f "$MONITOR_PID_FILE" ]; then + old_pid=$(cat "$MONITOR_PID_FILE" 2>/dev/null || echo "") + if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then + log "Stopping existing monitor (pid=$old_pid)" + kill "$old_pid" 2>/dev/null || true + sleep 0.2 + fi + rm -f "$MONITOR_PID_FILE" 2>/dev/null || true + fi +} widen_and_monitor_klipper() { main_pid=$(systemctl show -p MainPID --value klipper.service 2>/dev/null || echo 0) [ "${main_pid:-0}" -gt 0 ] || return 0 - + + # Kill any existing monitor before starting a new one + kill_existing_monitor + children_file="/proc/$main_pid/task/$main_pid/children" - [ -f "$children_file" ] || { - log "WARN: /proc children file unavailable (CONFIG_PROC_CHILDREN=n?); child monitor disabled" - # Continue anyway - thread monitoring still useful - } - + has_children_file=0 + [ -f "$children_file" ] && has_children_file=1 + + if [ "$has_children_file" = "0" ]; then + log "WARN: /proc children file unavailable (CONFIG_PROC_CHILDREN=n?); using fallback child detection" + fi + # Widen cgroup to allow children on CPU 0 if ! systemctl set-property --runtime klipper.service "AllowedCPUs=$KLIPPER_CPUS_WIDE" >/dev/null 2>&1; then log "WARN: could not widen klipper cgroup; multiprocessing children will run on CPU $KLIPPER_MCU_TTY_CPU" return 0 fi - + # Immediately pin all existing threads to CPU 3 (no race window) taskset -apc "$KLIPPER_MCU_TTY_CPU" "$main_pid" >/dev/null 2>&1 || true log "Widened klipper cgroup to $KLIPPER_CPUS_WIDE, pinned existing threads to CPU $KLIPPER_MCU_TTY_CPU" - - # Start combined thread + child monitor - log "Starting klipper thread/child monitor for pid $main_pid" - + + # Compute expected affinity mask for CPU 3 (used for comparison) + # taskset -p returns hex like "8" for CPU 3 + target_mask=$(printf '%x' $((1 << KLIPPER_MCU_TTY_CPU))) + + log "Starting klipper thread/child monitor for pid $main_pid (poll=${MONITOR_POLL_INTERVAL}s)" + ( + # Write our PID to the file so we can be killed on re-run + echo $$ > "$MONITOR_PID_FILE" 2>/dev/null || true + + # Cleanup PID file on exit + trap 'rm -f "$MONITOR_PID_FILE" 2>/dev/null; exit 0' INT TERM EXIT + seen_children="" - target_mask=$(printf '%x' $((1 << KLIPPER_MCU_TTY_CPU))) - + thread_check_interval=10 # Only re-check thread affinity every 10 iterations (1 second) + iteration=0 + while kill -0 "$main_pid" 2>/dev/null; do - # --- Pin any new/migrated threads in main process to CPU 3 --- - for tid_path in /proc/"$main_pid"/task/*; do - tid="${tid_path##*/}" - [ -d "$tid_path" ] || continue - - # Check current affinity - only touch if not already correct - current=$(cat "$tid_path/status" 2>/dev/null | awk '/^Cpus_allowed:/{print $2}' || echo "") - if [ -n "$current" ] && [ "$current" != "$target_mask" ]; then - taskset -pc "$KLIPPER_MCU_TTY_CPU" "$tid" >/dev/null 2>&1 || true - fi - done - - # --- Migrate child processes to CPU 0 --- - if [ -f "$children_file" ]; then - for cpid in $(cat "$children_file" 2>/dev/null); do - case " $seen_children " in - *" $cpid "*) continue ;; - esac - - if [ -d "/proc/$cpid" ]; then - tgid=$(awk '/^Tgid:/{print $2}' "/proc/$cpid/status" 2>/dev/null || echo "") - if [ "$tgid" = "$cpid" ]; then - taskset -apc "$MISC_CPU" "$cpid" >/dev/null 2>&1 && \ - logger -t "$TAG" "klipper child $cpid -> CPU $MISC_CPU" - renice -n 15 -p "$cpid" >/dev/null 2>&1 || true - ionice -c3 -p "$cpid" >/dev/null 2>&1 || true - seen_children="$seen_children $cpid" + iteration=$((iteration + 1)) + + # --- Pin new/migrated threads in main process to CPU 3 --- + # Only check periodically to reduce overhead (threads don't drift often) + if [ "$((iteration % thread_check_interval))" = "0" ]; then + for tid_path in /proc/"$main_pid"/task/*; do + tid="${tid_path##*/}" + [ -d "$tid_path" ] || continue + [ "$tid" != "$main_pid" ] || continue # Skip main thread (already pinned) + + # Get current affinity via taskset (more reliable than parsing /proc) + # Output format: "pid 1234's current affinity mask: 8" + current=$(taskset -p "$tid" 2>/dev/null | sed 's/.*: //' || echo "") + + # Compare masks - only re-pin if different + if [ -n "$current" ] && [ "$current" != "$target_mask" ]; then + taskset -pc "$KLIPPER_MCU_TTY_CPU" "$tid" >/dev/null 2>&1 || true + fi + done + fi + + # --- Migrate and demote child processes --- + child_pids="" + + if [ "$has_children_file" = "1" ]; then + # Fast path: read from children file + child_pids=$(cat "$children_file" 2>/dev/null || true) + else + # Fallback: find processes whose parent is klipper + child_pids=$(pgrep -P "$main_pid" 2>/dev/null || true) + fi + + for cpid in $child_pids; do + [ -n "$cpid" ] || continue + + # Skip already-seen children + case " $seen_children " in + *" $cpid "*) continue ;; + esac + + # Verify it's a real child process (not a thread) + if [ -d "/proc/$cpid" ]; then + # Read Tgid and PPid in one pass for efficiency + proc_info=$(awk '/^Tgid:|^PPid:/{printf "%s ", $2}' "/proc/$cpid/status" 2>/dev/null || echo "") + tgid=$(echo "$proc_info" | awk '{print $1}') + ppid=$(echo "$proc_info" | awk '{print $2}') + + # Must be a process leader (tgid == pid) and child of klipper + if [ "$tgid" = "$cpid" ] && [ "$ppid" = "$main_pid" ]; then + + # CRITICAL: First demote from inherited SCHED_FIFO to SCHED_OTHER + # This MUST happen before renice (renice has no effect on RT processes) + if chrt -o -p 0 "$cpid" >/dev/null 2>&1; then + logger -t "$TAG" "klipper child $cpid demoted to SCHED_OTHER" + else + logger -t "$TAG" "WARN: failed to demote child $cpid from SCHED_FIFO" + fi + + # Now migrate to MISC_CPU + if taskset -apc "$MISC_CPU" "$cpid" >/dev/null 2>&1; then + logger -t "$TAG" "klipper child $cpid -> CPU $MISC_CPU" fi + + # Apply nice and ionice (now that it's SCHED_OTHER, these work) + renice -n 15 -p "$cpid" >/dev/null 2>&1 || true + ionice -c2 -n 7 -p "$cpid" >/dev/null 2>&1 || true # best-effort class, low priority + + seen_children="$seen_children $cpid" fi + fi + done + + # Clean up seen_children list (remove dead processes to prevent unbounded growth) + if [ "$((iteration % 100))" = "0" ]; then + new_seen="" + for cpid in $seen_children; do + [ -d "/proc/$cpid" ] && new_seen="$new_seen $cpid" done + seen_children="$new_seen" fi - - # 250ms poll: balance between responsiveness and overhead - sleep 0.25 + + sleep "$MONITOR_POLL_INTERVAL" done - + logger -t "$TAG" "klipper monitor exiting (main process gone)" ) & - + monitor_pid=$! + + # Pin monitor to MISC_CPU with lowest priority taskset -pc "$MISC_CPU" "$monitor_pid" >/dev/null 2>&1 || true renice -n 19 -p "$monitor_pid" >/dev/null 2>&1 || true ionice -c3 -p "$monitor_pid" >/dev/null 2>&1 || true - + log "Thread/child monitor running as pid $monitor_pid on CPU $MISC_CPU" } @@ -373,7 +473,7 @@ if [ -n "${IRQ_S2:-}" ]; then log "ttyS2 irq=$IRQ_S2 aff=$aff2" fi -log "klipper-mcu:$(ps_line "$(systemctl show -p MainPID --value klipper-mcu.service 2>/dev/null || echo 0)" || true)" -log "klipper: $(ps_line "$(systemctl show -p MainPID --value klipper.service 2>/dev/null || echo 0)" || true)" +log "klipper-mcu:$(ps_line "$(systemctl show -p MainPID --value klipper-mcu.service 2>/dev/null || echo 0)")" +log "klipper: $(ps_line "$(systemctl show -p MainPID --value klipper.service 2>/dev/null || echo 0)")" log "done (IRQs: ttyS0:${IRQ_S0:-?} ttyS1:${IRQ_S1:-?} ttyS2:${IRQ_S2:-?}, multiprocessing=$MULTIPROCESSING_DETECTED)" exit 0 From dc5b70be3266aeb3bc66bd4889cd78a158e60148 Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Fri, 30 Jan 2026 14:43:51 +0000 Subject: [PATCH 10/17] Update affinity-setup.sh Refactor & simplified affinity setup script for low-jitter tuning and more relaxed CPU management. --- affinity-setup.sh | 527 +++++++++++----------------------------------- 1 file changed, 126 insertions(+), 401 deletions(-) diff --git a/affinity-setup.sh b/affinity-setup.sh index 36cc68e..be71b53 100644 --- a/affinity-setup.sh +++ b/affinity-setup.sh @@ -1,479 +1,204 @@ #!/bin/sh -# Klipper stack affinity + PREEMPT_RT realtime setup (no unit file edits on disk) -# Auto-detects multiprocessing plugins (Cartographer, Beacon, IDM) and handles -# child process migration + continuous thread pinning. -# -# Fixes applied: -# - Child processes are demoted from inherited SCHED_FIFO to SCHED_OTHER -# - Cpus_allowed comparison handles leading zeros correctly -# - Faster 100ms polling for quicker child detection -# - Reduced thread re-pinning overhead (only on affinity drift) -# - Proper error handling throughout +# Klipper stack affinity + low-jitter tuning (MCU-safe, PREEMPT_RT aware) +# Implements: CPU governor lock, irqbalance neutralization, polling IRQ discovery +# Avoids: FIFO promotion, RT throttle disable, kernel isolation set -eu -TAG="affinity-setup" +TAG="klipper-affinity-safe" log() { logger -t "$TAG" -- "$@" printf '%s: %s\n' "$TAG" "$*" } -# Re-exec as root if needed +# ---------------- Re-exec as root ---------------- if [ "$(id -u)" != 0 ]; then exec sudo -E -- "$0" "$@" fi -# --- CPU layout (0-based) ---------------------------------------------------- -MISC_CPU=0 # moonraker/mobileraker/mjpg-streamer/power_monitor + ttyS2 IRQ -DISPLAY_TTY_CPU=1 # display.service + ttyS1 IRQ -KLIPPER_MCU_RPI_CPU=2 # klipper-mcu.service (host-MCU tasks) -KLIPPER_MCU_TTY_CPU=3 # klipper.service + ttyS0 IRQ - -KLIPPER_CPUS_WIDE="0,3" # widened cgroup for child migration - -# Monitor polling interval in seconds (100ms for responsive child detection) -MONITOR_POLL_INTERVAL="0.1" - -# --- auto-detection of multiprocessing plugins ------------------------------- -detect_multiprocessing_plugins() { - log_paths=" - /home/*/printer_data/logs/klippy.log* - /home/*/klipper_logs/klippy.log* - " - - # Patterns indicating multiprocessing plugins loaded - pattern='\[cartographer\]|\[scanner\]|\[beacon\]|\[mcu eddy\]|btt_eddy|\[idm\]' - - # shellcheck disable=SC2086 - for glob in $log_paths; do - for f in $glob; do - [ -f "$f" ] || continue - if head -n 5000 "$f" 2>/dev/null | grep -qE "$pattern"; then - log "Detected multiprocessing plugin in $f" - return 0 - fi - done - done +# ---------------- CPU layout (0-based) ---------------- +MISC_CPU=0 # moonraker, webcam, power monitor, ttyS2 +DISPLAY_CPU=1 # display.service, ttyS1 +KLIPPER_CPU=2 # klipper.service + ttyS0 IRQ +KLIPPER_MCU_CPU=3 # klipper-mcu.service - return 1 -} +# ---- Optional: co-locate klipper-mcu with klipper (cache locality vs contention) +# KLIPPER_MCU_CPU=$KLIPPER_CPU -# --- basic env checks (warn-only) -------------------------------------------- -have() { command -v "$1" >/dev/null 2>&1; } +# ---- Optional: demote FIFO processes to CFS before renicing +# Set to "yes" to force CFS scheduling (safer), "no" to leave existing scheduler +DEMOTE_FIFO_TO_CFS="no" -if systemctl is-active --quiet irqbalance.service 2>/dev/null; then - log "WARN: irqbalance is active; it may override IRQ affinities." -fi +# ---------------- Helpers ---------------- +have() { command -v "$1" >/dev/null 2>&1; } -for bin in systemctl awk ps sed chrt taskset ionice renice stty sysctl logger grep pgrep; do - have "$bin" || log "WARN: missing helper '$bin' (some steps may be skipped)." +for bin in systemctl awk ps sed taskset ionice renice stty chrt cpupower logger; do + have "$bin" || log "WARN: missing helper '$bin'" done cpu_online() { c="$1" [ -d "/sys/devices/system/cpu/cpu$c" ] || return 1 onf="/sys/devices/system/cpu/cpu$c/online" - [ ! -f "$onf" ] || [ "$(cat "$onf" 2>/dev/null || echo 1)" = "1" ] + [ ! -f "$onf" ] || [ "$(cat "$onf")" = "1" ] } -# Validate CPU list (handles single CPU, ranges, and lists) -validate_cpu_list() { - cpus="$1" - case "$cpus" in - *,*) - # Comma-separated list: validate each part - for part in $(echo "$cpus" | tr ',' ' '); do - case "$part" in - *-*) ;; # Range - assume valid - *) cpu_online "$part" || return 1 ;; - esac - done - ;; - *-*) ;; # Range - assume valid - *) cpu_online "$cpus" || return 1 ;; - esac - return 0 -} - -# --- helpers ----------------------------------------------------------------- wait_active() { unit="$1"; t=0 while [ "$t" -lt 30 ]; do - state=$(systemctl is-active "$unit" 2>/dev/null || true) pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) - if [ "$state" = "active" ] && [ "${pid:-0}" -gt 0 ]; then - return 0 - fi - sleep 0.5 - t=$((t+1)) - done - log "WARN: $unit did not become active in time; continuing." - return 0 -} - -wait_irq_present() { - dev="$1"; t=0 - while [ "$t" -lt 20 ]; do - irq=$(awk -v n="$dev" '$NF==n{gsub(":", "", $1); print $1; exit}' /proc/interrupts) - [ -n "$irq" ] && return 0 + [ "$pid" -gt 0 ] && return 0 sleep 0.5 t=$((t+1)) done - log "WARN: IRQ for $dev not found; continuing." - return 0 + log "WARN: $unit did not become active" } irq_for() { awk -v name="$1" '$NF==name{gsub(":", "", $1); print $1; exit}' /proc/interrupts } +# Poll for IRQ to appear (handles lazy IRQ registration) +poll_irq() { + dev="$1"; timeout="${2:-10}"; t=0 + irq="" + while [ "$t" -lt "$timeout" ]; do + irq="$(irq_for "$dev" || true)" + [ -n "$irq" ] && break + sleep 1 + t=$((t+1)) + done + echo "$irq" +} + pin_irq() { irq="$1"; cpu="$2" [ -n "$irq" ] || return 0 - p="/proc/irq/$irq" - [ -d "$p" ] || return 0 - cpu_online "$cpu" || { log "WARN: CPU $cpu not online; skip pin IRQ $irq"; return 0; } - - if [ -w "$p/smp_affinity_list" ]; then - echo "$cpu" > "$p/smp_affinity_list" 2>/dev/null || true - else - printf '%x\n' "$((1< "$p/smp_affinity" 2>/dev/null || true - fi - log "Pinned IRQ $irq to CPU $cpu" + cpu_online "$cpu" || return 0 + echo "$cpu" > "/proc/irq/$irq/smp_affinity_list" 2>/dev/null || true + log "Pinned IRQ $irq -> CPU $cpu" } set_unit_cpus() { - unit="$1"; cpus="$2" - - validate_cpu_list "$cpus" || { log "WARN: CPU(s) $cpus not valid; skip $unit CPU pin"; return 0; } - - if systemctl set-property --runtime "$unit" "AllowedCPUs=$cpus" >/dev/null 2>&1; then - log "AllowedCPUs for $unit -> $cpus" - return 0 + unit="$1"; cpu="$2" + cpu_online "$cpu" || return 0 + if ! systemctl set-property --runtime "$unit" AllowedCPUs="$cpu" >/dev/null 2>&1; then + pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) + [ "$pid" -gt 0 ] && taskset -pc "$cpu" "$pid" >/dev/null 2>&1 || true fi + log "Pinned $unit -> CPU $cpu" +} - # Fallback to taskset +# Demote RT (FIFO/RR) to CFS if enabled +demote_to_cfs() { + unit="$1" + [ "$DEMOTE_FIFO_TO_CFS" = "yes" ] || return 0 pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) - if [ "${pid:-0}" -gt 0 ] && taskset -pc "$cpus" "$pid" >/dev/null 2>&1; then - log "taskset fallback for $unit(pid=$pid) -> $cpus" - return 0 - fi - - log "WARN: could not set CPUs for $unit" - return 1 + [ "$pid" -gt 0 ] || return 0 + # Check current scheduling class + cls=$(ps -o cls= -p "$pid" 2>/dev/null || echo "TS") + case "$cls" in + FF|RR) + chrt -o 0 -p "$pid" 2>/dev/null && log "Demoted $unit from $cls to CFS" || true + ;; + esac } renice_unit() { - unit="$1"; niceval="$2" + unit="$1"; nice="$2" pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) - if [ "${pid:-0}" -gt 0 ] && renice -n "$niceval" -p "$pid" >/dev/null 2>&1; then - log "renice $unit(pid=$pid) -> $niceval" - fi - return 0 + [ "$pid" -gt 0 ] && renice "$nice" -p "$pid" >/dev/null 2>&1 || true } ionice_idle_unit() { unit="$1" pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) - if [ "${pid:-0}" -gt 0 ] && ionice -c3 -p "$pid" >/dev/null 2>&1; then - log "ionice idle $unit(pid=$pid)" - fi - return 0 + [ "$pid" -gt 0 ] && ionice -c3 -p "$pid" >/dev/null 2>&1 || true } ps_line() { pid="$1" - [ "${pid:-0}" -gt 0 ] || return 0 - ps -o pid,cls,rtprio,psr,cmd -p "$pid" --no-headers 2>/dev/null | sed 's/^/ /' || true -} - -promote_unit_fifo() { - unit="$1"; prio="$2" - systemctl set-property --runtime "$unit" CPUSchedulingPolicy=fifo CPUSchedulingPriority="$prio" >/dev/null 2>&1 || true - pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) - [ "${pid:-0}" -gt 0 ] || return 0 - - n=0 - while [ "$n" -lt 10 ]; do - chrt -a -f -p "$prio" "$pid" >/dev/null 2>&1 || true - cls=$(ps -o cls= -p "$pid" 2>/dev/null | xargs || true) - rt=$(ps -o rtprio= -p "$pid" 2>/dev/null | xargs || true) - if [ "$cls" = "FF" ] && [ "$rt" = "$prio" ]; then - log "$unit(pid=$pid) -> FIFO $prio (all threads)" - return 0 - fi - sleep 0.3 - n=$((n+1)) - done - log "WARN: $unit(pid=$pid) did not reach FIFO $prio (last: cls=$cls rtprio=$rt)" - return 0 + ps -o pid,cls,rtprio,ni,psr,cmd -p "$pid" --no-headers 2>/dev/null | sed 's/^/ /' } -chrt_irq_thread() { - irq="$1"; prio="$2"; t=0 - [ -n "$irq" ] || return 0 - while [ "$t" -lt 20 ]; do - pid=$( - ps -eLo pid=,cmd= 2>/dev/null | awk -v irq="$irq" ' - { - pid=$1; $1=""; sub(/^[ \t]+/,""); name=$0; - gsub(/^\[/,"",name); gsub(/\]$/,"",name); - if (name ~ ("^irq/" irq "-")) { print pid; exit } - }' - ) - if [ -n "$pid" ]; then - chrt -f -p "$prio" "$pid" >/dev/null 2>&1 || true - log "IRQ thread irq/$irq -> FIFO $prio (pid=$pid)" - return 0 - fi - sleep 0.5 - t=$((t+1)) +# ---------------- CPU governor: performance ---------------- +if have cpupower; then + cpupower frequency-set -g performance >/dev/null 2>&1 || true +else + for g in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do + [ -w "$g" ] && echo performance > "$g" 2>/dev/null || true done - log "WARN: did not find threaded IRQ for $irq" - return 0 -} - -# --- multiprocessing support ------------------------------------------------- -MONITOR_PID_FILE="/run/klipper-affinity-monitor.pid" - -kill_existing_monitor() { - if [ -f "$MONITOR_PID_FILE" ]; then - old_pid=$(cat "$MONITOR_PID_FILE" 2>/dev/null || echo "") - if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then - log "Stopping existing monitor (pid=$old_pid)" - kill "$old_pid" 2>/dev/null || true - sleep 0.2 - fi - rm -f "$MONITOR_PID_FILE" 2>/dev/null || true - fi -} - -widen_and_monitor_klipper() { - main_pid=$(systemctl show -p MainPID --value klipper.service 2>/dev/null || echo 0) - [ "${main_pid:-0}" -gt 0 ] || return 0 - - # Kill any existing monitor before starting a new one - kill_existing_monitor - - children_file="/proc/$main_pid/task/$main_pid/children" - has_children_file=0 - [ -f "$children_file" ] && has_children_file=1 - - if [ "$has_children_file" = "0" ]; then - log "WARN: /proc children file unavailable (CONFIG_PROC_CHILDREN=n?); using fallback child detection" - fi - - # Widen cgroup to allow children on CPU 0 - if ! systemctl set-property --runtime klipper.service "AllowedCPUs=$KLIPPER_CPUS_WIDE" >/dev/null 2>&1; then - log "WARN: could not widen klipper cgroup; multiprocessing children will run on CPU $KLIPPER_MCU_TTY_CPU" - return 0 - fi +fi +log "CPU governor set to performance" - # Immediately pin all existing threads to CPU 3 (no race window) - taskset -apc "$KLIPPER_MCU_TTY_CPU" "$main_pid" >/dev/null 2>&1 || true - log "Widened klipper cgroup to $KLIPPER_CPUS_WIDE, pinned existing threads to CPU $KLIPPER_MCU_TTY_CPU" - - # Compute expected affinity mask for CPU 3 (used for comparison) - # taskset -p returns hex like "8" for CPU 3 - target_mask=$(printf '%x' $((1 << KLIPPER_MCU_TTY_CPU))) - - log "Starting klipper thread/child monitor for pid $main_pid (poll=${MONITOR_POLL_INTERVAL}s)" - - ( - # Write our PID to the file so we can be killed on re-run - echo $$ > "$MONITOR_PID_FILE" 2>/dev/null || true - - # Cleanup PID file on exit - trap 'rm -f "$MONITOR_PID_FILE" 2>/dev/null; exit 0' INT TERM EXIT - - seen_children="" - thread_check_interval=10 # Only re-check thread affinity every 10 iterations (1 second) - iteration=0 - - while kill -0 "$main_pid" 2>/dev/null; do - iteration=$((iteration + 1)) - - # --- Pin new/migrated threads in main process to CPU 3 --- - # Only check periodically to reduce overhead (threads don't drift often) - if [ "$((iteration % thread_check_interval))" = "0" ]; then - for tid_path in /proc/"$main_pid"/task/*; do - tid="${tid_path##*/}" - [ -d "$tid_path" ] || continue - [ "$tid" != "$main_pid" ] || continue # Skip main thread (already pinned) - - # Get current affinity via taskset (more reliable than parsing /proc) - # Output format: "pid 1234's current affinity mask: 8" - current=$(taskset -p "$tid" 2>/dev/null | sed 's/.*: //' || echo "") - - # Compare masks - only re-pin if different - if [ -n "$current" ] && [ "$current" != "$target_mask" ]; then - taskset -pc "$KLIPPER_MCU_TTY_CPU" "$tid" >/dev/null 2>&1 || true - fi - done - fi - - # --- Migrate and demote child processes --- - child_pids="" - - if [ "$has_children_file" = "1" ]; then - # Fast path: read from children file - child_pids=$(cat "$children_file" 2>/dev/null || true) - else - # Fallback: find processes whose parent is klipper - child_pids=$(pgrep -P "$main_pid" 2>/dev/null || true) - fi - - for cpid in $child_pids; do - [ -n "$cpid" ] || continue - - # Skip already-seen children - case " $seen_children " in - *" $cpid "*) continue ;; - esac - - # Verify it's a real child process (not a thread) - if [ -d "/proc/$cpid" ]; then - # Read Tgid and PPid in one pass for efficiency - proc_info=$(awk '/^Tgid:|^PPid:/{printf "%s ", $2}' "/proc/$cpid/status" 2>/dev/null || echo "") - tgid=$(echo "$proc_info" | awk '{print $1}') - ppid=$(echo "$proc_info" | awk '{print $2}') - - # Must be a process leader (tgid == pid) and child of klipper - if [ "$tgid" = "$cpid" ] && [ "$ppid" = "$main_pid" ]; then - - # CRITICAL: First demote from inherited SCHED_FIFO to SCHED_OTHER - # This MUST happen before renice (renice has no effect on RT processes) - if chrt -o -p 0 "$cpid" >/dev/null 2>&1; then - logger -t "$TAG" "klipper child $cpid demoted to SCHED_OTHER" - else - logger -t "$TAG" "WARN: failed to demote child $cpid from SCHED_FIFO" - fi - - # Now migrate to MISC_CPU - if taskset -apc "$MISC_CPU" "$cpid" >/dev/null 2>&1; then - logger -t "$TAG" "klipper child $cpid -> CPU $MISC_CPU" - fi - - # Apply nice and ionice (now that it's SCHED_OTHER, these work) - renice -n 15 -p "$cpid" >/dev/null 2>&1 || true - ionice -c2 -n 7 -p "$cpid" >/dev/null 2>&1 || true # best-effort class, low priority - - seen_children="$seen_children $cpid" - fi - fi - done - - # Clean up seen_children list (remove dead processes to prevent unbounded growth) - if [ "$((iteration % 100))" = "0" ]; then - new_seen="" - for cpid in $seen_children; do - [ -d "/proc/$cpid" ] && new_seen="$new_seen $cpid" - done - seen_children="$new_seen" - fi - - sleep "$MONITOR_POLL_INTERVAL" - done - - logger -t "$TAG" "klipper monitor exiting (main process gone)" - ) & - - monitor_pid=$! - - # Pin monitor to MISC_CPU with lowest priority - taskset -pc "$MISC_CPU" "$monitor_pid" >/dev/null 2>&1 || true - renice -n 19 -p "$monitor_pid" >/dev/null 2>&1 || true - ionice -c3 -p "$monitor_pid" >/dev/null 2>&1 || true - - log "Thread/child monitor running as pid $monitor_pid on CPU $MISC_CPU" -} +# ---------------- irqbalance neutralization ---------------- +if systemctl is-active --quiet irqbalance.service 2>/dev/null; then + systemctl stop irqbalance.service + systemctl mask irqbalance.service + log "irqbalance stopped and masked (run 'systemctl unmask irqbalance' to restore)" +fi -# --- wait for services ------------------------------------------------------- +# ---------------- Wait for services ---------------- for u in klipper.service klipper-mcu.service moonraker.service display.service; do wait_active "$u" done -# --- auto-detect multiprocessing plugins ------------------------------------- -MULTIPROCESSING_DETECTED=0 -if detect_multiprocessing_plugins; then - MULTIPROCESSING_DETECTED=1 - log "Multiprocessing plugin support enabled" -else - log "No multiprocessing plugins detected; using standard pinning" -fi - -# --- wait for UART IRQs ------------------------------------------------------ -wait_irq_present ttyS0 -wait_irq_present ttyS1 -wait_irq_present ttyS2 - -# --- pin UART IRQs ----------------------------------------------------------- -IRQ_S0="$(irq_for ttyS0 || true)" -IRQ_S1="$(irq_for ttyS1 || true)" -IRQ_S2="$(irq_for ttyS2 || true)" - -[ -n "${IRQ_S0:-}" ] && pin_irq "$IRQ_S0" "$KLIPPER_MCU_TTY_CPU" -[ -n "${IRQ_S1:-}" ] && pin_irq "$IRQ_S1" "$DISPLAY_TTY_CPU" -[ -n "${IRQ_S2:-}" ] && pin_irq "$IRQ_S2" "$MISC_CPU" - -# --- place units on CPUs (klipper gets single CPU initially) ----------------- -set_unit_cpus klipper-mcu.service "$KLIPPER_MCU_RPI_CPU" -set_unit_cpus klipper.service "$KLIPPER_MCU_TTY_CPU" -set_unit_cpus display.service "$DISPLAY_TTY_CPU" -set_unit_cpus moonraker.service "$MISC_CPU" -set_unit_cpus mjpg-streamer-webcam1.service "$MISC_CPU" || true -set_unit_cpus mobileraker.service "$MISC_CPU" || true -set_unit_cpus power_monitor.service "$MISC_CPU" || true - -# --- serial tuning for /dev/ttyS0 ------------------------------------------- -if [ -e /dev/ttyS0 ]; then - have setserial && setserial /dev/ttyS0 low_latency || true - stty -F /dev/ttyS0 cs8 -parenb -cstopb -ixon -ixoff -crtscts \ - -icanon -echo -echoe -echok -echoctl -echoke -iexten \ - -inlcr -igncr -icrnl -opost -hupcl min 1 time 0 || true -else - log "WARN: /dev/ttyS0 not present; skipped stty/setserial" -fi - -# --- make display gentle ----------------------------------------------------- +# ---------------- IRQ discovery (polling) ---------------- +log "Polling for UART IRQs..." +IRQ_S0="$(poll_irq ttyS0 10)" +IRQ_S1="$(poll_irq ttyS1 5)" +IRQ_S2="$(poll_irq ttyS2 5)" + +[ -n "$IRQ_S0" ] || log "WARN: ttyS0 IRQ not found after polling" +[ -n "$IRQ_S1" ] || log "WARN: ttyS1 IRQ not found after polling" +[ -n "$IRQ_S2" ] || log "WARN: ttyS2 IRQ not found after polling" + +# ---------------- IRQ affinity ---------------- +[ -n "$IRQ_S0" ] && pin_irq "$IRQ_S0" "$KLIPPER_CPU" +[ -n "$IRQ_S1" ] && pin_irq "$IRQ_S1" "$DISPLAY_CPU" +[ -n "$IRQ_S2" ] && pin_irq "$IRQ_S2" "$MISC_CPU" + +# ---------------- CPU affinity for services ---------------- +set_unit_cpus klipper.service "$KLIPPER_CPU" +set_unit_cpus klipper-mcu.service "$KLIPPER_MCU_CPU" +set_unit_cpus display.service "$DISPLAY_CPU" +set_unit_cpus moonraker.service "$MISC_CPU" +set_unit_cpus mjpg-streamer-webcam1.service "$MISC_CPU" 2>/dev/null || true +set_unit_cpus mobileraker.service "$MISC_CPU" 2>/dev/null || true +set_unit_cpus power_monitor.service "$MISC_CPU" 2>/dev/null || true + +# ---------------- Scheduler demotion (optional) ---------------- +demote_to_cfs klipper.service +demote_to_cfs klipper-mcu.service + +# ---------------- Priority tuning (CFS nice values) ---------------- +renice_unit klipper.service -18 +renice_unit klipper-mcu.service -10 + +# ---------------- Make display/UI gentle ---------------- renice_unit display.service 19 ionice_idle_unit display.service -# --- RT budget --------------------------------------------------------------- -sysctl -w kernel.sched_rt_runtime_us=-1 >/dev/null 2>&1 || \ - echo -1 > /proc/sys/kernel/sched_rt_runtime_us 2>/dev/null || \ - log "WARN: failed to set sched_rt_runtime_us" - -# --- promote Klippy + host MCU to SCHED_FIFO 60 (sticky) --------------------- -promote_unit_fifo klipper-mcu.service 60 -promote_unit_fifo klipper.service 60 - -# --- multiprocessing support (only if plugin detected) ----------------------- -# Done AFTER RT promotion to avoid race window -if [ "$MULTIPROCESSING_DETECTED" = "1" ]; then - widen_and_monitor_klipper +# ---------------- Serial low-latency tuning ---------------- +if [ -c /dev/ttyS0 ]; then + have setserial && setserial /dev/ttyS0 low_latency 2>/dev/null || true + stty -F /dev/ttyS0 raw -echo -ixon -ixoff min 1 time 0 2>/dev/null || true +else + log "WARN: /dev/ttyS0 not a character device" fi -# --- bump ttyS0 IRQ thread if present ---------------------------------------- -if [ -n "${IRQ_S0:-}" ]; then - chrt_irq_thread "$IRQ_S0" 70 -fi +# ---------------- Summary ---------------- +log "ttyS0 irq=${IRQ_S0:-?}" +log "ttyS1 irq=${IRQ_S1:-?}" +log "ttyS2 irq=${IRQ_S2:-?}" -# --- summary ----------------------------------------------------------------- -if [ -n "${IRQ_S0:-}" ]; then - aff0=$(cat "/proc/irq/$IRQ_S0/smp_affinity_list" 2>/dev/null || echo "?") - log "ttyS0 irq=$IRQ_S0 aff=$aff0" -fi -if [ -n "${IRQ_S1:-}" ]; then - aff1=$(cat "/proc/irq/$IRQ_S1/smp_affinity_list" 2>/dev/null || echo "?") - log "ttyS1 irq=$IRQ_S1 aff=$aff1" -fi -if [ -n "${IRQ_S2:-}" ]; then - aff2=$(cat "/proc/irq/$IRQ_S2/smp_affinity_list" 2>/dev/null || echo "?") - log "ttyS2 irq=$IRQ_S2 aff=$aff2" +log "klipper: $(ps_line "$(systemctl show -p MainPID --value klipper.service 2>/dev/null || echo 0)")" +log "klipper-mcu: $(ps_line "$(systemctl show -p MainPID --value klipper-mcu.service 2>/dev/null || echo 0)")" + +if [ "$DEMOTE_FIFO_TO_CFS" = "yes" ]; then + log "done (CFS mode forced, governor locked, irqbalance disabled)" +else + log "done (scheduler unchanged, governor locked, irqbalance disabled)" fi -log "klipper-mcu:$(ps_line "$(systemctl show -p MainPID --value klipper-mcu.service 2>/dev/null || echo 0)")" -log "klipper: $(ps_line "$(systemctl show -p MainPID --value klipper.service 2>/dev/null || echo 0)")" -log "done (IRQs: ttyS0:${IRQ_S0:-?} ttyS1:${IRQ_S1:-?} ttyS2:${IRQ_S2:-?}, multiprocessing=$MULTIPROCESSING_DETECTED)" exit 0 From 39ba26fbf6a65c288cdc4e468ebcc2d046f1240b Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Tue, 3 Feb 2026 21:30:18 +0000 Subject: [PATCH 11/17] Refactor kamp page display logic and cleanup conditions --- display.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/display.py b/display.py index 531a3f5..622b7b5 100644 --- a/display.py +++ b/display.py @@ -446,8 +446,9 @@ async def special_page_handling(self, current_page): await self.display.draw_initial_zprobe_leveling(self.z_probe_step, self.z_probe_distance) self._loop.create_task(self.handle_zprobe_leveling()) elif current_page == PAGE_PRINTING_KAMP: - await self.display.draw_kamp_page(self.bed_leveling_counts) - return + if not self._rapid_scan_mode: + await self.display.draw_kamp_page(self.bed_leveling_counts) + return await self.display.special_page_handling(current_page) @@ -2066,7 +2067,7 @@ async def handle_gcode_response(self, response): current_page = await self._get_current_page() if current_page != PAGE_PRINTING_KAMP: self._loop.create_task(self._navigate_to_page(PAGE_PRINTING_KAMP, clear_history=True)) - elif "Beginning rapid surface scan" in response or "[cartographer] Starting stream" in response: + elif "Beginning rapid surface scan" in response or "Touch home at" in response: # Rapid scan mode (Eddy/Cartographer/Beacon) - these probes don't send # "Adapted mesh bounds" or "probe: at" messages, so we handle everything here self._rapid_scan_mode = True @@ -2117,25 +2118,13 @@ async def handle_gcode_response(self, response): ) ) - elif response.startswith("// Mesh Bed Leveling Complete") or "[cartographer] Collecting samples along the scanning path completed" in response: + elif response.startswith("// Mesh Bed Leveling Complete") or "Collecting samples along the scanning path completed" in response: # If rapid scan mode was active, show completion # Draw boxes if we received probe counts (some probes send both rapid scan AND counts) if self._rapid_scan_mode: - if self.bed_leveling_counts[0] > 0: - # We have counts, draw all boxes green - total_probes = self.bed_leveling_counts[0] * self.bed_leveling_counts[1] - for i in range(total_probes): - self._loop.create_task( - self.display.draw_kamp_box_index(i, BACKGROUND_SUCCESS, self.bed_leveling_counts) - ) - self._loop.create_task( - self.display.update_kamp_text(f"Scan complete! ({total_probes} points)") - ) - else: - # No counts available, just show completion text - self._loop.create_task( - self.display.update_kamp_text("Scan complete!") - ) + self._loop.create_task( + self.display.update_kamp_text("Scan complete!") + ) self.bed_leveling_probed_count = 0 self.bed_leveling_counts = self.full_bed_leveling_counts From 2eeab40097894c74c91284af161ef1aa9ed7a5d6 Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Wed, 4 Feb 2026 15:02:28 +0000 Subject: [PATCH 12/17] Improve file locking and error handling Refactor file locking and error handling in display.py for better concurrency and reliability. --- display.py | 154 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 94 insertions(+), 60 deletions(-) diff --git a/display.py b/display.py index 622b7b5..39760ff 100644 --- a/display.py +++ b/display.py @@ -711,9 +711,10 @@ async def _handle_file_selection(): selected = self.dir_contents[(self.files_page * 5) + index] is_dir = selected["type"] == "dir" file_path = selected["path"] + if is_dir: + self.current_dir = file_path + self.files_page = 0 if is_dir: - self.current_dir = file_path - self.files_page = 0 await self._load_files() else: async with self._filename_lock: @@ -996,7 +997,7 @@ async def _load_files(self): {"path": "/".join(["gcodes", self.current_dir])}, ) dir_info = data["result"] - self.dir_contents = [] + dirs = [] for item in dir_info["dirs"]: if not item["dirname"].startswith("."): @@ -1026,15 +1027,18 @@ async def _load_files(self): sort_folders_first = self.config["files"].getboolean( "sort_folders_first", fallback=True ) - if sort_folders_first: - self.dir_contents = self.sort_dir_contents(dirs) + self.sort_dir_contents( - files - ) - else: - self.dir_contents = self.sort_dir_contents(dirs + files) - await self.display.show_files_page( - self.current_dir, self.dir_contents, self.files_page - ) + + # Update shared state under lock + async with self._files_lock: + if sort_folders_first: + self.dir_contents = self.sort_dir_contents(dirs) + self.sort_dir_contents(files) + else: + self.dir_contents = self.sort_dir_contents(dirs + files) + current_dir = self.current_dir + dir_contents = self.dir_contents + files_page = self.files_page + + await self.display.show_files_page(current_dir, dir_contents, files_page) def _page_id(self, page): return self.display.mapper.map_page(page) @@ -1055,17 +1059,22 @@ async def _go_back(self): history_len = len(self.history) if history_len <= 1: logger.debug("Already at the main page.") - return # ← This is the ONLY place this message should appear + return # Get current page WITHOUT releasing lock (safe - no await) current_page = self.history[-1] if self.history else None - # PHASE 2: Handle FILES special case (outside lock) - if current_page == PAGE_FILES and self.current_dir != "": - self.current_dir = "/".join(self.current_dir.split("/")[:-1]) - self.files_page = 0 - await self._load_files() # I/O outside lock - return + # PHASE 2: Handle FILES special case (with proper file locking) + if current_page == PAGE_FILES: + should_load_files = False + async with self._files_lock: + if self.current_dir != "": + self.current_dir = "/".join(self.current_dir.split("/")[:-1]) + self.files_page = 0 + should_load_files = True + if should_load_files: + await self._load_files() + return # PHASE 3: Pop history and determine navigation (under lock - DATA ONLY) back_page = None @@ -1088,15 +1097,12 @@ async def _go_back(self): # Map page under lock (it's just a dict lookup - fast and safe) mapped_page = self.display.mapper.map_page(back_page) - # PHASE 4: Navigate (outside lock - I/O) - if back_page is None or mapped_page is None: - logger.debug("No valid page to navigate back to.") - return - - await self.display.navigate_to(mapped_page) - logger.debug(f"Navigating back to {back_page}") + # PHASE 4: Perform navigation (outside lock - I/O) + if back_page is not None and mapped_page is not None: + await self.display.navigate_to(mapped_page) + logger.debug(f"Navigating back to {back_page}") - # PHASE 5: Special page handling (outside lock - I/O) + # PHASE 5: Handle special page logic (outside lock - may do I/O) try: await self.special_page_handling(back_page) except Exception as e: @@ -1245,6 +1251,9 @@ async def _cleanup_stale_requests(self): await asyncio.sleep(60) async def _send_moonraker_request(self, method, params=None): + if not self.connected or self.writer is None: + raise ConnectionError("Not connected to Moonraker") + if params is None: params = {} message = self._make_rpc_msg(method, **params) @@ -1263,13 +1272,22 @@ async def _send_moonraker_request(self, method, params=None): self.writer.write(data) await self.writer.drain() return await asyncio.wait_for(fut, timeout=self.REQUEST_TIMEOUT) - except asyncio.TimeoutError: + except asyncio.CancelledError: + # External cancellation (shutdown, reconnect) - always propagate + async with self.pending_reqs_lock: + self.pending_reqs.pop(message["id"], None) + raise + except (asyncio.TimeoutError, ConnectionError): + # Timeout or connection closed - let caller handle reconnect logic async with self.pending_reqs_lock: self.pending_reqs.pop(message["id"], None) raise except Exception: + # Unexpected error - log but don't call close() here + # Let the error propagate; caller/listen() handles reconnection logger.exception("Unexpected error _send_moonraker_request") - await self.close() + async with self.pending_reqs_lock: + self.pending_reqs.pop(message["id"], None) raise def _find_ips(self, network): @@ -1437,21 +1455,23 @@ async def _process_stream(self, reader: asyncio.StreamReader) -> None: errors_remaining: int = 10 while not reader.at_eof(): if self.klipper_restart_event.is_set(): - await self._attempt_reconnect() self.klipper_restart_event.clear() + await self._attempt_reconnect() + return # Exit - new connection started try: data = await reader.readuntil(b"\x03") decoded = data[:-1].decode(encoding="utf-8") item = json.loads(decoded) except (ConnectionError, asyncio.IncompleteReadError): await self._attempt_reconnect() - break + return # Exit - new connection started except asyncio.CancelledError: raise except Exception: errors_remaining -= 1 if not errors_remaining or not self.connected: await self._attempt_reconnect() + return # Exit - new connection started continue errors_remaining = 10 if "id" in item: @@ -1460,10 +1480,10 @@ async def _process_stream(self, reader: asyncio.StreamReader) -> None: if request_data is not None: fut, _ = request_data fut.set_result(item) - elif item["method"] == "notify_status_update": + elif item.get("method") == "notify_status_update": await self.handle_status_update(item["params"][0]) - elif item["method"] == "notify_gcode_response": - await self.handle_gcode_response(item["params"][0]) + elif item.get("method") == "notify_gcode_response": + await self.handle_gcode_response(item["params"][0]) logger.info("Unix Socket Disconnection from _process_stream()") await self.close() @@ -1501,29 +1521,25 @@ def safe_int_convert(value, default=0): async def _attempt_reconnect(self): async with self._reconnect_lock: if self._is_reconnecting: - logger.debug("Reconnection already in progress, skipping...") return self._is_reconnecting = True try: - # Close existing connection - if self.writer and not self.writer.is_closing(): + # Cancel existing listen task first + if self._listen_task and not self._listen_task.done(): + self._listen_task.cancel() try: - self.writer.close() - await self.writer.wait_closed() - except Exception as e: - logger.debug(f"Error closing writer: {e}") + await self._listen_task + except asyncio.CancelledError: + pass - self.connected = False + # Close existing connection + await self.close() logger.info("Attempting to reconnect to Moonraker...") await asyncio.sleep(1) - # Clear the listening flag if it's stuck self._is_listening = False - - # Allow thread pool recreation self.resources.allow_new_pool() - self.start_listening() finally: self._is_reconnecting = False @@ -1862,19 +1878,28 @@ async def handle_status_update(self, new_data, data_mapping=None): # Load thumbnail if needed if should_load_thumbnail and thumbnail_filename: logger.info(f"Loading thumbnail for {thumbnail_filename} on printing page") - if self._thumbnail_task and not self._thumbnail_task.done(): - self._thumbnail_task.cancel() + + # Cancel existing task under lock + task_to_cancel = None + async with self._filename_lock: + if self._thumbnail_task and not self._thumbnail_task.done(): + task_to_cancel = self._thumbnail_task + self._thumbnail_task = None + + if task_to_cancel: + task_to_cancel.cancel() try: - await self._thumbnail_task + await task_to_cancel except asyncio.CancelledError: pass - self._thumbnail_task = self._loop.create_task( - self.load_thumbnail_for_page( - thumbnail_filename, - thumbnail_page_id + async with self._filename_lock: + self._thumbnail_task = self._loop.create_task( + self.load_thumbnail_for_page( + thumbnail_filename, + thumbnail_page_id + ) ) - ) elif state_to_process == "complete": if current_page is None or current_page != PAGE_PRINTING_COMPLETE: @@ -1992,13 +2017,22 @@ async def close(self): return self.connected = False - # Cancel the process_stream task if it's still running + # Cancel all pending request futures + async with self.pending_reqs_lock: + for req_id, (fut, _) in list(self.pending_reqs.items()): + if not fut.done(): + fut.set_exception(ConnectionError("Connection closed")) + self.pending_reqs.clear() + + # Cancel process_stream task only if it's not the current task (avoid self-cancellation) + current_task = asyncio.current_task() if self._process_stream_task and not self._process_stream_task.done(): - self._process_stream_task.cancel() - try: - await self._process_stream_task - except asyncio.CancelledError: - pass + if self._process_stream_task is not current_task: + self._process_stream_task.cancel() + try: + await self._process_stream_task + except asyncio.CancelledError: + pass if self.writer: self.writer.close() From 536035e869d632bcc3de313dc77f518bcef3e11f Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Wed, 25 Feb 2026 11:36:24 +0000 Subject: [PATCH 13/17] Make display updates self-apply and simplify affinity tuning --- affinity-setup.sh | 217 +++++++---------------------------- affinity.service | 16 +-- display-service-installer.sh | 3 +- display.service | 7 ++ 4 files changed, 55 insertions(+), 188 deletions(-) mode change 100644 => 100755 affinity-setup.sh diff --git a/affinity-setup.sh b/affinity-setup.sh old mode 100644 new mode 100755 index be71b53..998d9c8 --- a/affinity-setup.sh +++ b/affinity-setup.sh @@ -1,204 +1,69 @@ #!/bin/sh -# Klipper stack affinity + low-jitter tuning (MCU-safe, PREEMPT_RT aware) -# Implements: CPU governor lock, irqbalance neutralization, polling IRQ discovery -# Avoids: FIFO promotion, RT throttle disable, kernel isolation +# Minimal display affinity helper: +# - optionally set CPU governor to performance +# - de-prioritize display.service so UI work is less likely to interfere +# with Klipper/Moonraker on low-power SBCs set -eu -TAG="klipper-affinity-safe" +TAG="display-affinity-minimal" +ENABLE_PERFORMANCE_GOVERNOR="${ENABLE_PERFORMANCE_GOVERNOR:-yes}" + log() { - logger -t "$TAG" -- "$@" + logger -t "$TAG" -- "$@" 2>/dev/null || true printf '%s: %s\n' "$TAG" "$*" } -# ---------------- Re-exec as root ---------------- +have() { command -v "$1" >/dev/null 2>&1; } + +# Re-exec as root because systemd may call this via affinity.service. if [ "$(id -u)" != 0 ]; then exec sudo -E -- "$0" "$@" fi -# ---------------- CPU layout (0-based) ---------------- -MISC_CPU=0 # moonraker, webcam, power monitor, ttyS2 -DISPLAY_CPU=1 # display.service, ttyS1 -KLIPPER_CPU=2 # klipper.service + ttyS0 IRQ -KLIPPER_MCU_CPU=3 # klipper-mcu.service - -# ---- Optional: co-locate klipper-mcu with klipper (cache locality vs contention) -# KLIPPER_MCU_CPU=$KLIPPER_CPU - -# ---- Optional: demote FIFO processes to CFS before renicing -# Set to "yes" to force CFS scheduling (safer), "no" to leave existing scheduler -DEMOTE_FIFO_TO_CFS="no" - -# ---------------- Helpers ---------------- -have() { command -v "$1" >/dev/null 2>&1; } - -for bin in systemctl awk ps sed taskset ionice renice stty chrt cpupower logger; do - have "$bin" || log "WARN: missing helper '$bin'" -done - -cpu_online() { - c="$1" - [ -d "/sys/devices/system/cpu/cpu$c" ] || return 1 - onf="/sys/devices/system/cpu/cpu$c/online" - [ ! -f "$onf" ] || [ "$(cat "$onf")" = "1" ] -} - -wait_active() { - unit="$1"; t=0 - while [ "$t" -lt 30 ]; do - pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) - [ "$pid" -gt 0 ] && return 0 - sleep 0.5 - t=$((t+1)) - done - log "WARN: $unit did not become active" -} - -irq_for() { - awk -v name="$1" '$NF==name{gsub(":", "", $1); print $1; exit}' /proc/interrupts -} - -# Poll for IRQ to appear (handles lazy IRQ registration) -poll_irq() { - dev="$1"; timeout="${2:-10}"; t=0 - irq="" - while [ "$t" -lt "$timeout" ]; do - irq="$(irq_for "$dev" || true)" - [ -n "$irq" ] && break - sleep 1 - t=$((t+1)) - done - echo "$irq" -} - -pin_irq() { - irq="$1"; cpu="$2" - [ -n "$irq" ] || return 0 - cpu_online "$cpu" || return 0 - echo "$cpu" > "/proc/irq/$irq/smp_affinity_list" 2>/dev/null || true - log "Pinned IRQ $irq -> CPU $cpu" -} - -set_unit_cpus() { - unit="$1"; cpu="$2" - cpu_online "$cpu" || return 0 - if ! systemctl set-property --runtime "$unit" AllowedCPUs="$cpu" >/dev/null 2>&1; then - pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) - [ "$pid" -gt 0 ] && taskset -pc "$cpu" "$pid" >/dev/null 2>&1 || true - fi - log "Pinned $unit -> CPU $cpu" -} - -# Demote RT (FIFO/RR) to CFS if enabled -demote_to_cfs() { +mainpid() { unit="$1" - [ "$DEMOTE_FIFO_TO_CFS" = "yes" ] || return 0 - pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) - [ "$pid" -gt 0 ] || return 0 - # Check current scheduling class - cls=$(ps -o cls= -p "$pid" 2>/dev/null || echo "TS") - case "$cls" in - FF|RR) - chrt -o 0 -p "$pid" 2>/dev/null && log "Demoted $unit from $cls to CFS" || true - ;; - esac + systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0 } renice_unit() { - unit="$1"; nice="$2" - pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) - [ "$pid" -gt 0 ] && renice "$nice" -p "$pid" >/dev/null 2>&1 || true + unit="$1" + nice_val="$2" + pid="$(mainpid "$unit")" + [ "$pid" -gt 0 ] || return 0 + renice "$nice_val" -p "$pid" >/dev/null 2>&1 || true } ionice_idle_unit() { unit="$1" - pid=$(systemctl show -p MainPID --value "$unit" 2>/dev/null || echo 0) - [ "$pid" -gt 0 ] && ionice -c3 -p "$pid" >/dev/null 2>&1 || true + pid="$(mainpid "$unit")" + [ "$pid" -gt 0 ] || return 0 + have ionice || return 0 + ionice -c3 -p "$pid" >/dev/null 2>&1 || true } -ps_line() { - pid="$1" - ps -o pid,cls,rtprio,ni,psr,cmd -p "$pid" --no-headers 2>/dev/null | sed 's/^/ /' +set_performance_governor() { + [ "$ENABLE_PERFORMANCE_GOVERNOR" = "yes" ] || { + log "Skipping CPU governor change" + return 0 + } + + if have cpupower; then + cpupower frequency-set -g performance >/dev/null 2>&1 || true + else + for g in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do + [ -w "$g" ] && echo performance > "$g" 2>/dev/null || true + done + fi + log "CPU governor set to performance (best effort)" } -# ---------------- CPU governor: performance ---------------- -if have cpupower; then - cpupower frequency-set -g performance >/dev/null 2>&1 || true -else - for g in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do - [ -w "$g" ] && echo performance > "$g" 2>/dev/null || true - done -fi -log "CPU governor set to performance" - -# ---------------- irqbalance neutralization ---------------- -if systemctl is-active --quiet irqbalance.service 2>/dev/null; then - systemctl stop irqbalance.service - systemctl mask irqbalance.service - log "irqbalance stopped and masked (run 'systemctl unmask irqbalance' to restore)" -fi - -# ---------------- Wait for services ---------------- -for u in klipper.service klipper-mcu.service moonraker.service display.service; do - wait_active "$u" -done - -# ---------------- IRQ discovery (polling) ---------------- -log "Polling for UART IRQs..." -IRQ_S0="$(poll_irq ttyS0 10)" -IRQ_S1="$(poll_irq ttyS1 5)" -IRQ_S2="$(poll_irq ttyS2 5)" - -[ -n "$IRQ_S0" ] || log "WARN: ttyS0 IRQ not found after polling" -[ -n "$IRQ_S1" ] || log "WARN: ttyS1 IRQ not found after polling" -[ -n "$IRQ_S2" ] || log "WARN: ttyS2 IRQ not found after polling" +set_performance_governor -# ---------------- IRQ affinity ---------------- -[ -n "$IRQ_S0" ] && pin_irq "$IRQ_S0" "$KLIPPER_CPU" -[ -n "$IRQ_S1" ] && pin_irq "$IRQ_S1" "$DISPLAY_CPU" -[ -n "$IRQ_S2" ] && pin_irq "$IRQ_S2" "$MISC_CPU" - -# ---------------- CPU affinity for services ---------------- -set_unit_cpus klipper.service "$KLIPPER_CPU" -set_unit_cpus klipper-mcu.service "$KLIPPER_MCU_CPU" -set_unit_cpus display.service "$DISPLAY_CPU" -set_unit_cpus moonraker.service "$MISC_CPU" -set_unit_cpus mjpg-streamer-webcam1.service "$MISC_CPU" 2>/dev/null || true -set_unit_cpus mobileraker.service "$MISC_CPU" 2>/dev/null || true -set_unit_cpus power_monitor.service "$MISC_CPU" 2>/dev/null || true - -# ---------------- Scheduler demotion (optional) ---------------- -demote_to_cfs klipper.service -demote_to_cfs klipper-mcu.service - -# ---------------- Priority tuning (CFS nice values) ---------------- -renice_unit klipper.service -18 -renice_unit klipper-mcu.service -10 - -# ---------------- Make display/UI gentle ---------------- -renice_unit display.service 19 +# Keep the UI process gentle. Do not touch klipper/klipper-mcu scheduling, +# CPU affinity, IRQ affinity, or serial driver tuning here. +renice_unit display.service 19 ionice_idle_unit display.service - -# ---------------- Serial low-latency tuning ---------------- -if [ -c /dev/ttyS0 ]; then - have setserial && setserial /dev/ttyS0 low_latency 2>/dev/null || true - stty -F /dev/ttyS0 raw -echo -ixon -ixoff min 1 time 0 2>/dev/null || true -else - log "WARN: /dev/ttyS0 not a character device" -fi - -# ---------------- Summary ---------------- -log "ttyS0 irq=${IRQ_S0:-?}" -log "ttyS1 irq=${IRQ_S1:-?}" -log "ttyS2 irq=${IRQ_S2:-?}" - -log "klipper: $(ps_line "$(systemctl show -p MainPID --value klipper.service 2>/dev/null || echo 0)")" -log "klipper-mcu: $(ps_line "$(systemctl show -p MainPID --value klipper-mcu.service 2>/dev/null || echo 0)")" - -if [ "$DEMOTE_FIFO_TO_CFS" = "yes" ]; then - log "done (CFS mode forced, governor locked, irqbalance disabled)" -else - log "done (scheduler unchanged, governor locked, irqbalance disabled)" -fi +log "Applied gentle priority tuning to display.service" exit 0 diff --git a/affinity.service b/affinity.service index 8b630c6..9d9bf91 100644 --- a/affinity.service +++ b/affinity.service @@ -1,20 +1,16 @@ [Unit] -Description=Pin UART IRQs and set CPU affinities/priorities for Klipper stack (dynamic detection) +Description=Apply minimal display priority tuning -# Start after these are up, and pull them in if needed -After=klipper-mcu.service klipper.service display.service moonraker.service -Wants=klipper-mcu.service klipper.service display.service moonraker.service +# Run after display is up so the script can tune display.service safely. +After=display.service -# If any of these restart/stop, restart this oneshot too -PartOf=klipper-mcu.service klipper.service display.service +# Re-run when display.service is restarted/stopped. +PartOf=display.service [Service] Type=oneshot RemainAfterExit=yes -ExecStart=/usr/local/sbin/affinity-setup.sh +ExecStart=/home/mks/display_connector/affinity-setup.sh [Install] -WantedBy=multi-user.target -WantedBy=klipper-mcu.service -WantedBy=klipper.service WantedBy=display.service diff --git a/display-service-installer.sh b/display-service-installer.sh index dfc4b73..7bd8fb5 100755 --- a/display-service-installer.sh +++ b/display-service-installer.sh @@ -58,9 +58,8 @@ echo "Reloading systemd units..." sudo systemctl daemon-reload # --- Enable + start services ------------------------------------------------- -echo "Enabling and starting affinity.service..." +echo "Enabling affinity.service..." sudo systemctl enable affinity.service -sudo systemctl start affinity.service echo "Enabling and starting display.service..." sudo systemctl enable display.service diff --git a/display.service b/display.service index 6eee02d..41c8721 100644 --- a/display.service +++ b/display.service @@ -9,6 +9,13 @@ Type=simple User=mks Group=mks WorkingDirectory=/home/mks/display_connector +# Self-sync systemd units from the repo on startup so Moonraker-triggered restarts +# can apply updated unit files without re-running the installer manually. +ExecStartPre=+/usr/bin/install -m 0644 /home/mks/display_connector/affinity.service /etc/systemd/system/affinity.service +ExecStartPre=+/usr/bin/install -m 0644 /home/mks/display_connector/display.service /etc/systemd/system/display.service +ExecStartPre=+/bin/chmod 0755 /home/mks/display_connector/affinity-setup.sh +ExecStartPre=+/bin/systemctl daemon-reload +ExecStartPre=+/bin/systemctl enable affinity.service ExecStartPre=/bin/sleep 10 ExecStart=/home/mks/display_connector/venv/bin/python /home/mks/display_connector/display.py From b68383f26cfbb961fa09d96f2aa6495c4846a873 Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Wed, 25 Feb 2026 12:05:43 +0000 Subject: [PATCH 14/17] Harden display async lifecycle and prevent page/thumbnail write races --- display.py | 31 ++++++++++++++++++++++--------- src/communicator.py | 12 +++++++----- src/elegoo_display.py | 20 +++++++++++++------- 3 files changed, 42 insertions(+), 21 deletions(-) diff --git a/display.py b/display.py index 39760ff..e59fb5c 100644 --- a/display.py +++ b/display.py @@ -1206,9 +1206,10 @@ async def listen(self): logger.error(f"Unexpected response format from printer.objects.subscribe: {ret}") raise Exception("Failed to subscribe to printer objects") - # Now wait for the process_stream task to complete (keeps connection alive) + # Keep the listen task alive while the Moonraker stream reader runs. logger.info("Listen task now monitoring connection...") if self._process_stream_task: + await asyncio.shield(self._process_stream_task) logger.info("Process stream task completed, connection closed") else: logger.warning("No process_stream task found!") @@ -1332,6 +1333,7 @@ async def connect_moonraker(self) -> None: logger.error( "KeyError encountered in software_version_response. Attempting to reconnect." ) + await self.close() await asyncio.sleep(5) # Wait before reconnecting continue # Retry the connection loop @@ -1339,6 +1341,7 @@ async def connect_moonraker(self) -> None: raise except Exception as e: logger.error(f"Error connecting to Moonraker: {e}") + await self.close() await asyncio.sleep(5) # Wait before reconnecting continue @@ -1479,7 +1482,8 @@ async def _process_stream(self, reader: asyncio.StreamReader) -> None: request_data = self.pending_reqs.pop(item["id"], None) if request_data is not None: fut, _ = request_data - fut.set_result(item) + if not fut.done(): + fut.set_result(item) elif item.get("method") == "notify_status_update": await self.handle_status_update(item["params"][0]) elif item.get("method") == "notify_gcode_response": @@ -2299,22 +2303,31 @@ async def watchdog_ping(): if config_observer.is_alive(): config_observer.join(timeout=5) - # Ensure all tasks are cancelled (Py 3.13 safe) + # Best-effort controller shutdown before closing the loop. + if "controller" in locals() and not loop.is_closed(): + try: + loop.run_until_complete(controller.close()) + except Exception as e: + logger.error(f"Error during controller close(): {e}") + + # Ensure all tasks are cancelled and given time to handle CancelledError. try: - asyncio.get_running_loop() # raises if no loop is running - pending = asyncio.all_tasks() - except RuntimeError: + pending = {t for t in asyncio.all_tasks(loop=loop) if not t.done()} + except Exception: pending = set() for task in list(pending): task.cancel() - if pending: + if pending and not loop.is_closed(): try: - loop.run_until_complete(asyncio.wait(pending, timeout=5)) + loop.run_until_complete( + asyncio.gather(*pending, return_exceptions=True) + ) + loop.run_until_complete(loop.shutdown_asyncgens()) except RuntimeError: # Loop may already be closed or not runnable here; best-effort shutdown. pass - + loop.close() logger.info("Service stopped") diff --git a/src/communicator.py b/src/communicator.py index 5481b44..518ea3a 100644 --- a/src/communicator.py +++ b/src/communicator.py @@ -68,7 +68,7 @@ async def _execute_command(self, data, timeout=None): # Any other error: log and continue self.logger.warning(f"Unexpected error writing to display: {e}") - async def write(self, data, timeout=None, blocked_key=None): + async def write(self, data, timeout=None, blocked_key=None, auto_unblock=True): # Fast path: decide blocking under lock async with self._write_lock: # If someone else is blocking, queue this command @@ -85,7 +85,7 @@ async def write(self, data, timeout=None, blocked_key=None): await self._execute_command(data, timeout) finally: # If this was a blocking op, release block and send the next queued command (if any) - if blocked_key: + if blocked_key and auto_unblock: await self.unblock(blocked_key) async def unblock(self, blocked_key): @@ -127,9 +127,11 @@ async def retrieve_nested_data(self, path): async def navigate_to(self, page_id): # Block other writes while we switch pages - await self.write(f"page {page_id}", blocked_key="__nav__") - await asyncio.sleep(0.25) # give the HMI time to swap pages - await self.unblock("__nav__") + try: + await self.write(f"page {page_id}", blocked_key="__nav__", auto_unblock=False) + await asyncio.sleep(0.25) # give the HMI time to swap pages + finally: + await self.unblock("__nav__") async def update_data(self, new_data, data_mapping=None, current_data=None): if data_mapping is None: diff --git a/src/elegoo_display.py b/src/elegoo_display.py index 82f1f93..f4b37c9 100644 --- a/src/elegoo_display.py +++ b/src/elegoo_display.py @@ -338,6 +338,7 @@ def __init__(self, logger: Logger, model: str, port: str, event_handler, baudrat self._cached_wifi_status = None self._last_wifi_check = 0 self._wifi_check_interval = 30 # Only check WiFi every 30 seconds + self._warning_task = None async def get_firmware_version(self) -> str: if self._firmware_version is None: # Check if the firmware version is cached @@ -345,10 +346,13 @@ async def get_firmware_version(self) -> str: return self._firmware_version async def send_warning_message(self): - await asyncio.sleep(0.6) # Add delay if needed - await self.write( - f'xstr 0,464,320,16,2,{TEXT_WARNING},{BACKGROUND_GRAY},1,1,1,"WARNING: Unsupported Display Firmware Version"' - ) + try: + await asyncio.sleep(0.6) # Add delay if needed + await self.write( + f'xstr 0,464,320,16,2,{TEXT_WARNING},{BACKGROUND_GRAY},1,1,1,"WARNING: Unsupported Display Firmware Version"' + ) + finally: + self._warning_task = None async def check_valid_version(self): version = await self.get_firmware_version() @@ -361,7 +365,8 @@ async def check_valid_version(self): "Unsupported firmware version. Consider updating to a supported version: " + ", ".join(self.supported_firmware_versions) ) - asyncio.create_task(self.send_warning_message()) # Send warning asynchronously + if self._warning_task is None or self._warning_task.done(): + self._warning_task = asyncio.create_task(self.send_warning_message()) return False return True @@ -555,7 +560,7 @@ async def update_wifi_ui(self): if self._cached_wifi_status is not None and (current_time - self._last_wifi_check) < self._wifi_check_interval: has_wifi, ssid, rssi_category = self._cached_wifi_status else: - has_wifi, ssid, rssi_category = get_wlan0_status() + has_wifi, ssid, rssi_category = await asyncio.to_thread(get_wlan0_status) self._cached_wifi_status = (has_wifi, ssid, rssi_category) self._last_wifi_check = current_time @@ -769,7 +774,8 @@ async def display_thumbnail(self, page_number, thumbnail): for part in parts: await self.write( str(page_number) + '.cp0.write("' + str(part) + '")', - blocked_key=f"thumbnail_{page_number}" + blocked_key=f"thumbnail_{page_number}", + auto_unblock=False, ) self.logger.debug("Thumbnail sent to display") await self.unblock(f"thumbnail_{page_number}") From 4dd6f6e272b81d04642ec2871c697cd0a78dbf7d Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Wed, 25 Feb 2026 12:18:43 +0000 Subject: [PATCH 15/17] Add auto_unblock parameter to write assertion --- tests/communicator_test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/communicator_test.py b/tests/communicator_test.py index 5767852..7fc32fa 100644 --- a/tests/communicator_test.py +++ b/tests/communicator_test.py @@ -29,4 +29,8 @@ async def test_navigate(communicator): await communicator.navigate_to("1") # navigate_to blocks other writes with a blocked_key="__nav__" - communicator.write.assert_awaited_once_with("page 1", blocked_key="__nav__") + communicator.write.assert_awaited_once_with( + "page 1", + blocked_key="__nav__", + auto_unblock=False, + ) From 32d30771b5e357dd7d15c664a1c51bf60d2ac445 Mon Sep 17 00:00:00 2001 From: HalfManBear <89969229+halfmanbear@users.noreply.github.com> Date: Wed, 25 Feb 2026 12:27:48 +0000 Subject: [PATCH 16/17] Simplify pending request cancellation on close Refactor closing logic to simplify pending request handling. --- display.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/display.py b/display.py index e59fb5c..d0e763b 100644 --- a/display.py +++ b/display.py @@ -2020,10 +2020,10 @@ async def close(self): if not self.connected: return self.connected = False - - # Cancel all pending request futures + + # Cancel all pending request futures async with self.pending_reqs_lock: - for req_id, (fut, _) in list(self.pending_reqs.items()): + for fut, _ in list(self.pending_reqs.values()): if not fut.done(): fut.set_exception(ConnectionError("Connection closed")) self.pending_reqs.clear() From bc40d22f482ccbc88ddd1eac384fb00835d2b09b Mon Sep 17 00:00:00 2001 From: LinFor Date: Wed, 25 Feb 2026 15:37:18 +0300 Subject: [PATCH 17/17] Refactor display positions and text for screw levels (#59) --- src/elegoo_display.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/src/elegoo_display.py b/src/elegoo_display.py index f4b37c9..5c5a020 100644 --- a/src/elegoo_display.py +++ b/src/elegoo_display.py @@ -655,34 +655,27 @@ async def draw_completed_screw_leveling(self, screw_levels): await self.write("vis b[7],0") await self.write("vis b[8],1") await self.write("fill 0,110,320,290,10665") + await self.write('xstr 12,320,100,20,1,65535,10665,1,1,1,"front left"') await self.draw_screw_level_info_at("12,340,100,20", screw_levels["front left"]) await self.write('xstr 170,320,100,20,1,65535,10665,1,1,1,"front right"') - await self.draw_screw_level_info_at( - "170,340,100,20", screw_levels["front right"] - ) - - await self.write('xstr 170,120,100,20,1,65535,10665,1,1,1,"rear right"') - await self.draw_screw_level_info_at( - "170,140,100,20", screw_levels["rear right"] - ) + await self.draw_screw_level_info_at("170,340,100,20", screw_levels["front right"]) await self.write('xstr 12,120,100,20,1,65535,10665,1,1,1,"rear left"') await self.draw_screw_level_info_at("12,140,100,20", screw_levels["rear left"]) + await self.write('xstr 170,120,100,20,1,65535,10665,1,1,1,"rear right"') + await self.draw_screw_level_info_at("170,140,100,20", screw_levels["rear right"]) + if "center right" in screw_levels: - await self.write('xstr 12,220,100,30,1,65535,10665,1,1,1,"center\\rright"') - await self.draw_screw_level_info_at( - "170,240,100,20", screw_levels["center right"] - ) + await self.write('xstr 172,220,100,20,1,65535,10665,1,1,1,"center right"') + await self.draw_screw_level_info_at("172,240,100,20", screw_levels["center right"]) if "center left" in screw_levels: - await self.write('xstr 12,120,100,20,1,65535,10665,1,1,1,"center\\rleft"') - await self.draw_screw_level_info_at( - "12,240,100,20", screw_levels["center left"] - ) + await self.write('xstr 0,220,100,20,1,65535,10665,1,1,1,"center left"') + await self.draw_screw_level_info_at("0,240,100,20", screw_levels["center left"]) - await self.write('xstr 96,215,100,50,1,65535,15319,1,1,1,"Retry"') + await self.write('xstr 106,214,60,60,1,65535,15319,1,1,1,"Retry"') async def draw_screw_level_info_at(self, position, level): if level == "base":