Skip to content

Commit 373efef

Browse files
committed
Admin/XMover: Add problematic translog detection and cancellation
The new command can find and cancel replica shards with large uncommitted translog sizes. It provides options to set size thresholds and optionally execute cancel commands after user confirmation.
1 parent 256ff3d commit 373efef

File tree

3 files changed

+212
-1
lines changed

3 files changed

+212
-1
lines changed

cratedb_toolkit/admin/xmover/analysis/shard.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -838,6 +838,108 @@ def plan_node_decommission(self, node_name: str, min_free_space_gb: float = 100.
838838
}
839839

840840

841+
class TranslogReporter:
842+
def __init__(self, client: CrateDBClient):
843+
self.client = client
844+
845+
def problematic_translogs(self, size_mb: int) -> List[str]:
846+
"""Find and optionally cancel shards with problematic translog sizes."""
847+
console.print(Panel.fit("[bold blue]Problematic Translog Analysis[/bold blue]"))
848+
console.print(f"[dim]Looking for replica shards with translog uncommitted size > {size_mb}MB[/dim]")
849+
console.print()
850+
851+
# Query to find problematic replica shards
852+
query = """
853+
SELECT sh.schema_name, \
854+
sh.table_name, \
855+
translate(p.values::text, ':{}', '=()') as partition_values, \
856+
sh.id AS shard_id, \
857+
node['name'] as node_name, \
858+
sh.translog_stats['uncommitted_size'] / 1024^2 AS translog_uncommitted_mb
859+
FROM
860+
sys.shards AS sh
861+
LEFT JOIN information_schema.table_partitions p
862+
ON sh.table_name = p.table_name
863+
AND sh.schema_name = p.table_schema
864+
AND sh.partition_ident = p.partition_ident
865+
WHERE
866+
sh.state = 'STARTED'
867+
AND sh.translog_stats['uncommitted_size'] \
868+
> ? * 1024^2
869+
AND primary = FALSE
870+
ORDER BY
871+
6 DESC \
872+
"""
873+
874+
try:
875+
result = self.client.execute_query(query, [size_mb])
876+
rows = result.get("rows", [])
877+
878+
if not rows:
879+
console.print(f"[green]✓ No replica shards found with translog uncommitted size > {size_mb}MB[/green]")
880+
return None
881+
882+
console.print(f"Found {len(rows)} shards with problematic translogs:")
883+
console.print()
884+
885+
# Display query results table
886+
from rich.table import Table
887+
888+
results_table = Table(title=f"Problematic Replica Shards (translog > {size_mb}MB)", box=box.ROUNDED)
889+
results_table.add_column("Schema", style="cyan")
890+
results_table.add_column("Table", style="blue")
891+
results_table.add_column("Partition", style="magenta")
892+
results_table.add_column("Shard ID", justify="right", style="yellow")
893+
results_table.add_column("Node", style="green")
894+
results_table.add_column("Translog MB", justify="right", style="red")
895+
896+
for row in rows:
897+
schema_name, table_name, partition_values, shard_id, node_name, translog_mb = row
898+
partition_display = (
899+
partition_values if partition_values and partition_values != "NULL" else "[dim]none[/dim]"
900+
)
901+
results_table.add_row(
902+
schema_name, table_name, partition_display, str(shard_id), node_name, f"{translog_mb:.1f}"
903+
)
904+
905+
console.print(results_table)
906+
console.print()
907+
console.print("[bold]Generated ALTER Commands:[/bold]")
908+
console.print()
909+
910+
# Generate ALTER commands
911+
alter_commands = []
912+
for row in rows:
913+
schema_name, table_name, partition_values, shard_id, node_name, translog_mb = row
914+
915+
# Build the ALTER command based on whether it's partitioned
916+
if partition_values and partition_values != "NULL":
917+
# partition_values already formatted like ("sync_day"=1757376000000) from the translate function
918+
alter_cmd = (
919+
f'ALTER TABLE "{schema_name}"."{table_name}" partition {partition_values} '
920+
f"REROUTE CANCEL SHARD {shard_id} on '{node_name}' WITH (allow_primary=False);"
921+
)
922+
else:
923+
alter_cmd = (
924+
f'ALTER TABLE "{schema_name}"."{table_name}" '
925+
f"REROUTE CANCEL SHARD {shard_id} on '{node_name}' WITH (allow_primary=False);"
926+
)
927+
928+
alter_commands.append(alter_cmd)
929+
console.print(alter_cmd)
930+
931+
console.print()
932+
console.print(f"[bold]Total: {len(alter_commands)} ALTER commands generated[/bold]")
933+
return alter_commands
934+
935+
except Exception as e:
936+
console.print(f"[red]Error analyzing problematic translogs: {e}[/red]")
937+
import traceback
938+
939+
console.print(f"[dim]{traceback.format_exc()}[/dim]")
940+
return []
941+
942+
841943
class ShardReporter:
842944
def __init__(self, analyzer: ShardAnalyzer):
843945
self.analyzer = analyzer

cratedb_toolkit/admin/xmover/cli.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@
1212
from rich.console import Console
1313
from rich.panel import Panel
1414

15-
from cratedb_toolkit.admin.xmover.analysis.shard import ActiveShardMonitor, ShardAnalyzer, ShardReporter
15+
from cratedb_toolkit.admin.xmover.analysis.shard import (
16+
ActiveShardMonitor,
17+
ShardAnalyzer,
18+
ShardReporter,
19+
TranslogReporter,
20+
)
1621
from cratedb_toolkit.admin.xmover.analysis.table import DistributionAnalyzer
1722
from cratedb_toolkit.admin.xmover.analysis.zone import ZoneReport
1823
from cratedb_toolkit.admin.xmover.model import (
@@ -444,6 +449,69 @@ def run_single_analysis():
444449
console.print(f"[dim]{traceback.format_exc()}[/dim]")
445450

446451

452+
@main.command()
453+
@click.option("--size-mb", default=300, help="Minimum translog uncommitted size in MB (default: 300)")
454+
@click.option("--cancel", is_flag=True, help="Execute the cancel commands after confirmation")
455+
@click.pass_context
456+
def problematic_translogs(ctx, size_mb: int, cancel: bool):
457+
"""
458+
Find and optionally cancel shards with problematic translog sizes.
459+
460+
This command identifies replica shards with large uncommitted translog sizes
461+
that may indicate replication issues. By default, it shows the ALTER commands
462+
that would cancel these shards. With --cancel, it executes them after confirmation.
463+
"""
464+
client = ctx.obj["client"]
465+
report = TranslogReporter(client=client)
466+
alter_commands = report.problematic_translogs(size_mb=size_mb)
467+
468+
try:
469+
if cancel and alter_commands:
470+
console.print()
471+
console.print("[yellow]⚠️ WARNING: This will cancel the specified shards![/yellow]")
472+
console.print("[yellow]This may cause temporary data unavailability for these shards.[/yellow]")
473+
console.print()
474+
475+
if click.confirm("Are you sure you want to execute these ALTER commands?"):
476+
console.print()
477+
console.print("[bold blue]Executing ALTER commands...[/bold blue]")
478+
479+
executed = 0
480+
failed = 0
481+
482+
for i, cmd in enumerate(alter_commands, 1):
483+
if cmd.startswith("--"):
484+
console.print(f"[yellow]Skipping command {i} (parse error): {cmd}[/yellow]")
485+
continue
486+
487+
try:
488+
console.print(f"[dim]({i}/{len(alter_commands)}) Executing...[/dim]")
489+
client.execute_query(cmd)
490+
console.print(f"[green]✓ Command {i} executed successfully[/green]")
491+
executed += 1
492+
except Exception as e:
493+
console.print(f"[red]✗ Command {i} failed: {e}[/red]")
494+
failed += 1
495+
496+
# Small delay between commands to avoid overwhelming the cluster
497+
if i < len(alter_commands):
498+
time.sleep(1)
499+
500+
console.print()
501+
console.print("[bold]Execution Summary:[/bold]")
502+
console.print(f"[green]✓ Successful: {executed}[/green]")
503+
if failed > 0:
504+
console.print(f"[red]✗ Failed: {failed}[/red]")
505+
else:
506+
console.print("[yellow]Operation cancelled by user[/yellow]")
507+
508+
except Exception as e:
509+
console.print(f"[red]Error analyzing problematic translogs: {e}[/red]")
510+
import traceback
511+
512+
console.print(f"[dim]{traceback.format_exc()}[/dim]")
513+
514+
447515
@main.command()
448516
@click.option("--table", "-t", help="Analyze zones for specific table only")
449517
@click.option("--show-shards/--no-show-shards", default=False, help="Show individual shard details (default: False)")

doc/admin/xmover/handbook.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,47 @@ xmover monitor-recovery --watch --include-transitioning
245245
- **DISK**: Rebuilding shard from local data (after restart/disk issues)
246246

247247

248+
### `problematic-translogs`
249+
Find and optionally cancel replica shards with problematic translog sizes.
250+
251+
**Options:**
252+
- `--size-mb INTEGER`: Minimum translog uncommitted size in MB (default: 300)
253+
- `--cancel`: Execute the cancel commands after confirmation
254+
255+
**Description:**
256+
This command identifies replica shards with large uncommitted translog sizes that may indicate replication issues. By default, it shows the ALTER commands that would cancel these shards. With `--cancel`, it executes them after confirmation.
257+
258+
**Examples:**
259+
```bash
260+
# Show problematic shards with translog > 300MB (default)
261+
xmover problematic-translogs
262+
263+
# Show shards with translog > 500MB
264+
xmover problematic-translogs --size-mb 500
265+
266+
# Execute cancel commands for shards > 1GB after confirmation
267+
xmover problematic-translogs --size-mb 1000 --cancel
268+
```
269+
270+
**Sample Output:**
271+
```
272+
Found 3 shards with problematic translogs:
273+
Problematic Replica Shards (translog > 300MB)
274+
╭────────┬───────────────────────────────┬────────────────────────────┬──────────┬────────────┬─────────────╮
275+
│ Schema │ Table │ Partition │ Shard ID │ Node │ Translog MB │
276+
├────────┼───────────────────────────────┼────────────────────────────┼──────────┼────────────┼─────────────┤
277+
│ TURVO │ shipmentFormFieldData │ none │ 14 │ data-hot-6 │ 7040.9 │
278+
│ TURVO │ shipmentFormFieldData_events │ ("sync_day"=1757376000000) │ 3 │ data-hot-2 │ 481.2 │
279+
│ TURVO │ orderFormFieldData │ none │ 5 │ data-hot-1 │ 469.5 │
280+
╰────────┴───────────────────────────────┴────────────────────────────┴──────────┴────────────┴─────────────╯
281+
Generated ALTER Commands:
282+
ALTER TABLE "TURVO"."shipmentFormFieldData" REROUTE CANCEL SHARD 14 on 'data-hot-6' WITH (allow_primary=False);
283+
ALTER TABLE "TURVO"."shipmentFormFieldData_events" partition ("sync_day"=1757376000000) REROUTE CANCEL SHARD 3 on 'data-hot-2' WITH (allow_primary=False);
284+
ALTER TABLE "TURVO"."orderFormFieldData" REROUTE CANCEL SHARD 5 on 'data-hot-1' WITH (allow_primary=False);
285+
Total: 3 ALTER commands generated
286+
```
287+
288+
248289
### `active-shards`
249290
Monitor the most active shards by tracking checkpoint progression over time.
250291
This command helps identify which shards are receiving the most write activity

0 commit comments

Comments
 (0)