From a8fdcba3015c21ba0ccfdb4abf08a968cd0b4d3e Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Sun, 15 Feb 2026 20:25:53 +0800 Subject: [PATCH 1/2] Fix checkpoint conversion failing on 20-layer model with 64 GPUs The converter asserts world_size <= num_layers, which fails when using all 8 nodes (64 GPUs) for a 20-layer model. Add num_nodes parameter to exec_command_all_ray_node and convert_checkpoint so callers can limit the number of nodes used. For DeepSeek-V3-0324-20layer, use only 2 nodes (16 GPUs) for conversion. --- miles/utils/external_utils/command_utils.py | 7 ++++++- miles/utils/misc.py | 15 +++++++++++++-- scripts/run_deepseek.py | 3 +++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index 0de5def17..697bac640 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -8,6 +8,7 @@ import random import time from dataclasses import dataclass +from functools import partial from pathlib import Path from miles.utils.misc import exec_command, exec_command_all_ray_node @@ -23,6 +24,7 @@ def convert_checkpoint( megatron_model_type, num_gpus_per_node: int, multinode: bool = False, + num_nodes: int | None = None, extra_args: str = "", dir_dst: str = "/root", hf_checkpoint: str | None = None, @@ -43,7 +45,10 @@ def convert_checkpoint( "--master-addr {{master_addr}} " "--master-port 23456 " "--nnodes={{nnodes}} " "--node-rank {{node_rank}} " ) - fn = exec_command_all_ray_node if multinode else exec_command + if multinode: + fn = partial(exec_command_all_ray_node, num_nodes=num_nodes) + else: + fn = exec_command fn( f"source {repo_base_dir}/scripts/models/{megatron_model_type}.sh && " f"PYTHONPATH={megatron_path} " diff --git a/miles/utils/misc.py b/miles/utils/misc.py index f6ce467c1..15dc63649 100644 --- a/miles/utils/misc.py +++ b/miles/utils/misc.py @@ -101,14 +101,19 @@ def _exec_command_on_node(cmd: str, capture_output: bool) -> str | None: return exec_command(f"unset CUDA_VISIBLE_DEVICES; {cmd}", capture_output=capture_output) -def exec_command_all_ray_node(cmd: str, capture_output: bool = False) -> list[str | None]: +def exec_command_all_ray_node( + cmd: str, capture_output: bool = False, num_nodes: int | None = None +) -> list[str | None]: """Execute a shell command on every alive Ray node in parallel. Supported placeholders in `cmd` (replaced per-node before execution): {{node_rank}} - 0-based index of the node - {{nnodes}} - total number of alive nodes + {{nnodes}} - total number of alive nodes (or num_nodes if specified) {{master_addr}} - NodeManagerAddress of the first node {{node_ip}} - NodeManagerAddress of the current node + + Args: + num_nodes: If set, only use the first `num_nodes` nodes instead of all alive nodes. """ ray.init(address="auto") try: @@ -119,6 +124,12 @@ def exec_command_all_ray_node(cmd: str, capture_output: bool = False) -> list[st ) assert len(nodes) > 0 + if num_nodes is not None: + assert num_nodes <= len(nodes), ( + f"Requested {num_nodes} nodes but only {len(nodes)} alive nodes available." + ) + nodes = nodes[:num_nodes] + master_addr = nodes[0]["NodeManagerAddress"] nnodes = str(len(nodes)) diff --git a/scripts/run_deepseek.py b/scripts/run_deepseek.py index a75401612..ab6e1e652 100644 --- a/scripts/run_deepseek.py +++ b/scripts/run_deepseek.py @@ -60,6 +60,7 @@ def _prepare_megatron_ckpt(args: ScriptArgs): extra_args = "--tensor-model-parallel-size 1 " "--expert-tensor-parallel-size 1 " num_gpus_per_node = args.num_gpus_per_node multinode = True + num_nodes = None if args.model_name == "DeepSeek-V3-0324-5layer": extra_args += "--pipeline-model-parallel-size 1 " "--expert-model-parallel-size 1 " num_gpus_per_node = min(4, num_gpus_per_node) @@ -69,6 +70,7 @@ def _prepare_megatron_ckpt(args: ScriptArgs): "--expert-model-parallel-size 4 " # PP info will be auto determined by converter script ) + num_nodes = 2 else: extra_args += ( "--pipeline-model-parallel-size 8 " @@ -83,6 +85,7 @@ def _prepare_megatron_ckpt(args: ScriptArgs): megatron_model_type=args.megatron_model_type, num_gpus_per_node=num_gpus_per_node, multinode=multinode, + num_nodes=num_nodes, extra_args=extra_args, dir_dst=args.model_dir, megatron_path=args.megatron_path, From d59ad8b1f29a89660ebfc95678000e610e7e1350 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Sun, 15 Feb 2026 22:32:57 +0800 Subject: [PATCH 2/2] fmt --- miles/utils/misc.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/miles/utils/misc.py b/miles/utils/misc.py index 15dc63649..313a60dda 100644 --- a/miles/utils/misc.py +++ b/miles/utils/misc.py @@ -125,9 +125,7 @@ def exec_command_all_ray_node( assert len(nodes) > 0 if num_nodes is not None: - assert num_nodes <= len(nodes), ( - f"Requested {num_nodes} nodes but only {len(nodes)} alive nodes available." - ) + assert num_nodes <= len(nodes), f"Requested {num_nodes} nodes but only {len(nodes)} alive nodes available." nodes = nodes[:num_nodes] master_addr = nodes[0]["NodeManagerAddress"]