From 75db6f167fc88149ddb819a3c97ea13c3a08cd79 Mon Sep 17 00:00:00 2001 From: FengweiZhang Date: Mon, 1 Dec 2025 07:04:58 +0000 Subject: [PATCH 1/6] [Bug] fix installation and tokenizer --- applications/DeepSpeed-Chat/dschat/__init__.py | 0 applications/DeepSpeed-Chat/dschat/rlhf/__init__.py | 0 applications/DeepSpeed-Chat/dschat/utils/__init__.py | 0 applications/DeepSpeed-Chat/dschat/utils/data/__init__.py | 0 applications/DeepSpeed-Chat/dschat/utils/model/__init__.py | 0 applications/DeepSpeed-Chat/dschat/utils/module/__init__.py | 0 applications/DeepSpeed-Chat/dschat/utils/utils.py | 4 ++-- 7 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 applications/DeepSpeed-Chat/dschat/__init__.py create mode 100644 applications/DeepSpeed-Chat/dschat/rlhf/__init__.py create mode 100644 applications/DeepSpeed-Chat/dschat/utils/__init__.py create mode 100644 applications/DeepSpeed-Chat/dschat/utils/data/__init__.py create mode 100644 applications/DeepSpeed-Chat/dschat/utils/model/__init__.py create mode 100644 applications/DeepSpeed-Chat/dschat/utils/module/__init__.py diff --git a/applications/DeepSpeed-Chat/dschat/__init__.py b/applications/DeepSpeed-Chat/dschat/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/applications/DeepSpeed-Chat/dschat/rlhf/__init__.py b/applications/DeepSpeed-Chat/dschat/rlhf/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/applications/DeepSpeed-Chat/dschat/utils/__init__.py b/applications/DeepSpeed-Chat/dschat/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/applications/DeepSpeed-Chat/dschat/utils/data/__init__.py b/applications/DeepSpeed-Chat/dschat/utils/data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/applications/DeepSpeed-Chat/dschat/utils/model/__init__.py b/applications/DeepSpeed-Chat/dschat/utils/model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/applications/DeepSpeed-Chat/dschat/utils/module/__init__.py b/applications/DeepSpeed-Chat/dschat/utils/module/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/applications/DeepSpeed-Chat/dschat/utils/utils.py b/applications/DeepSpeed-Chat/dschat/utils/utils.py index e4dc7d036..56138e834 100644 --- a/applications/DeepSpeed-Chat/dschat/utils/utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/utils.py @@ -74,8 +74,8 @@ def get(self): def get_tokenizer(model_name_or_path, fast_tokenizer=True): if "llama" in model_name_or_path: - from transformers.models.llama import LlamaTokenizer - tokenizer = LlamaTokenizer.from_pretrained( + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained( model_name_or_path, fast_tokenizer=fast_tokenizer) if tokenizer.pad_token is None: # assert tokenizer.eos_token is not None From 3f102114fb26b98df50eb5287912fbdb38adeaa0 Mon Sep 17 00:00:00 2001 From: FengweiZhang Date: Wed, 3 Dec 2025 08:36:26 +0000 Subject: [PATCH 2/6] [test] support nvme offload in llama2 lora --- .../DeepSpeed-Chat/dschat/utils/ds_utils.py | 5 +- .../step1_supervised_finetuning/main.py | 72 ++++++++++++++++++- .../llama2/run_llama2_7b_lora.sh | 55 ++++++++------ 3 files changed, 107 insertions(+), 25 deletions(-) diff --git a/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py b/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py index 0cf1c28ab..dc659b09a 100644 --- a/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py @@ -22,7 +22,8 @@ def 
get_train_ds_config(offload, enable_tensorboard=False, enable_mixed_precision_lora=False, tb_path="", - tb_name=""): + tb_name="", + offload_optimizer_config=None): device = "cpu" if offload else "none" if dtype == "fp16": @@ -45,6 +46,8 @@ def get_train_ds_config(offload, "stage3_prefetch_bucket_size": 3e7, "memory_efficient_linear": False } + if offload_optimizer_config: + zero_opt_dict["offload_optimizer"].update(offload_optimizer_config) if enable_mixed_precision_lora: zero_opt_dict["zero_quantized_nontrainable_weights"] = True if dist.get_world_size() != get_accelerator().device_count(): diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py index aa505a25d..4b197a2da 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py @@ -5,6 +5,7 @@ # DeepSpeed Team import argparse import math +from pprint import pformat import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler @@ -29,6 +30,18 @@ from dschat.utils.perf import print_throughput +def str2bool(value): + if isinstance(value, bool): + return value + lowered = value.lower() + if lowered in ("yes", "true", "t", "1"): + return True + if lowered in ("no", "false", "f", "0"): + return False + raise argparse.ArgumentTypeError( + f"Boolean value expected, got `{value}`.") + + def parse_args(): parser = argparse.ArgumentParser( description= @@ -145,6 +158,31 @@ def parse_args(): parser.add_argument('--offload', action='store_true', help='Enable ZeRO Offload techniques.') + parser.add_argument('--offload_optimizer_device', + type=str, + choices=['cpu', 'nvme'], + default=None, + help='Device to use for ZeRO optimizer state offload.') + parser.add_argument('--offload_optimizer_nvme_path', + type=str, + default=None, + help='NVMe path used when offloading optimizer states to nvme.') + parser.add_argument('--offload_optimizer_pin_memory', + type=str2bool, + default=None, + help='Whether to pin optimizer offload memory (true|false).') + parser.add_argument('--offload_optimizer_ratio', + type=float, + default=None, + help='Ratio of optimizer state to keep on device when offloading.') + parser.add_argument('--offload_optimizer_buffer_count', + type=int, + default=None, + help='Number of optimizer offload buffers.') + parser.add_argument('--offload_optimizer_fast_init', + type=str2bool, + default=None, + help='Use fast init for optimizer offload buffers (true|false).') parser.add_argument('--dtype', type=str, default='fp16', @@ -222,18 +260,39 @@ def main(): args.global_rank = torch.distributed.get_rank() + offload_optimizer_overrides = { + "device": args.offload_optimizer_device, + "nvme_path": args.offload_optimizer_nvme_path, + "pin_memory": args.offload_optimizer_pin_memory, + "ratio": args.offload_optimizer_ratio, + "buffer_count": args.offload_optimizer_buffer_count, + "fast_init": args.offload_optimizer_fast_init + } + offload_optimizer_overrides = { + key: value + for key, value in offload_optimizer_overrides.items() + if value is not None + } ds_config = get_train_ds_config(offload=args.offload, dtype=args.dtype, stage=args.zero_stage, enable_tensorboard=args.enable_tensorboard, tb_path=args.tensorboard_path, - tb_name="step1_model") + tb_name="step1_model", + offload_optimizer_config=( + offload_optimizer_overrides + if offload_optimizer_overrides else None)) ds_config[ 
'train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size
     ds_config[
         'train_batch_size'] = args.per_device_train_batch_size * torch.distributed.get_world_size(
         ) * args.gradient_accumulation_steps
+
+    # ds_config is now fully assembled, so print it for inspection
+    print_rank_0("***** DeepSpeed config *****", args.global_rank)
+    print_rank_0(pformat(ds_config), args.global_rank)
+
 
     # If passed along, set the training seed now.
     set_random_seed(args.seed)
 
@@ -319,6 +378,7 @@ def evaluation(model, eval_dataloader):
         model, args.weight_decay, args.lora_learning_rate)
 
     AdamOptimizer = DeepSpeedCPUAdam if args.offload else FusedAdam
+    print_rank_0(f"offload: {args.offload}", args.global_rank)
     optimizer = AdamOptimizer(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               betas=(0.9, 0.95))
@@ -348,8 +408,9 @@ def evaluation(model, eval_dataloader):
     print_rank_0(
         f"***** Evaluating perplexity, Epoch {0}/{args.num_train_epochs} *****",
         args.global_rank)
-    perplexity, eval_loss = evaluation(model, eval_dataloader)
-    print_rank_0(f"ppl: {perplexity}, loss: {eval_loss}", args.global_rank)
+    print_rank_0("Skipping initial evaluation", args.global_rank)
+    # perplexity, eval_loss = evaluation(model, eval_dataloader)
+    # print_rank_0(f"ppl: {perplexity}, loss: {eval_loss}", args.global_rank)
 
     for epoch in range(args.num_train_epochs):
         print_rank_0(
@@ -372,6 +433,11 @@ def evaluation(model, eval_dataloader):
         if torch.distributed.get_rank() == 0:
             print_throughput(model.model, args, end - start,
                              args.global_rank)
+
+        # Early exit for debugging: stop after 20 steps
+        if step > 20:
+            return 0
+
 
         # Evaluate perplexity on the validation set.
         print_rank_0(
diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh
index 7689266ee..c9c7d59f7 100755
--- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh
+++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh
@@ -13,24 +13,37 @@ if [ "$ZERO_STAGE" == "" ]; then
 fi
 mkdir -p $OUTPUT
 
-deepspeed main.py \
-   --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
-   --data_split 2,4,4 \
-   --model_name_or_path meta-llama/Llama-2-7b-hf \
-   --per_device_train_batch_size 4 \
-   --per_device_eval_batch_size 4 \
-   --max_seq_len 512 \
-   --learning_rate 9.65e-6 \
-   --weight_decay 0. \
-   --num_train_epochs 4 \
-   --gradient_accumulation_steps 1 \
-   --lr_scheduler_type cosine \
-   --num_warmup_steps 0 \
-   --seed 1234 \
-   --gradient_checkpointing \
-   --zero_stage $ZERO_STAGE \
-   --deepspeed \
-   --lora_dim 128 \
-   --lora_module_name "layers." \
-   --output_dir $OUTPUT \
-   &> $OUTPUT/training.log
+# nsys profile --output=nsight_reports/llama2_mem \
+deepspeed --master_port=29600 main.py \
+   --offload \
+   --offload_optimizer_device nvme \
+   --offload_optimizer_nvme_path /mnt/nvme2/deepspeed \
+   --offload_optimizer_pin_memory true \
+   --offload_optimizer_ratio 0.3 \
+   --offload_optimizer_buffer_count 4 \
+   --offload_optimizer_fast_init false \
+   --dtype bf16 \
+   --data_path Dahoas/rm-static \
+   --data_split 2,4,4 \
+   --model_name_or_path meta-llama/Llama-2-7b-hf \
+   --per_device_train_batch_size 4 \
+   --per_device_eval_batch_size 4 \
+   --max_seq_len 512 \
+   --learning_rate 9.65e-6 \
+   --weight_decay 0. 
\ + --num_train_epochs 4 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --gradient_checkpointing \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --lora_dim 128 \ + --lora_module_name "layers." \ + --data_output_path /tmp/data_files2 \ + --output_dir $OUTPUT + + + +# &> $OUTPUT/training.log From 98c59b0fcc5ee0112d120b1d99e551b827eff4f4 Mon Sep 17 00:00:00 2001 From: FengweiZhang Date: Sun, 7 Dec 2025 04:56:37 +0000 Subject: [PATCH 3/6] [test] add aio config --- .../DeepSpeed-Chat/dschat/utils/ds_utils.py | 11 ++- .../step1_supervised_finetuning/main.py | 80 ++++++++++++++++++- .../llama2/run_llama2_7b_lora.sh | 18 +++-- 3 files changed, 97 insertions(+), 12 deletions(-) diff --git a/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py b/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py index dc659b09a..e89f701cc 100644 --- a/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py @@ -23,7 +23,9 @@ def get_train_ds_config(offload, enable_mixed_precision_lora=False, tb_path="", tb_name="", - offload_optimizer_config=None): + offload_optimizer_config=None, + offload_param_config=None, + aio_config=None): device = "cpu" if offload else "none" if dtype == "fp16": @@ -48,12 +50,14 @@ def get_train_ds_config(offload, } if offload_optimizer_config: zero_opt_dict["offload_optimizer"].update(offload_optimizer_config) + if offload_param_config: + zero_opt_dict["offload_param"].update(offload_param_config) if enable_mixed_precision_lora: zero_opt_dict["zero_quantized_nontrainable_weights"] = True if dist.get_world_size() != get_accelerator().device_count(): zero_opt_dict["zero_hpz_partition_size"] = get_accelerator( ).device_count() - return { + config = { "train_batch_size": GLOBAL_BATCH_SIZE, "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, "steps_per_print": 10, @@ -76,6 +80,9 @@ def get_train_ds_config(offload, "job_name": f"{tb_name}_tensorboard" } } + if aio_config: + config["aio"] = aio_config + return config def get_eval_ds_config(offload, dtype, stage=0): diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py index 4b197a2da..8d1b1ede7 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py @@ -183,6 +183,51 @@ def parse_args(): type=str2bool, default=None, help='Use fast init for optimizer offload buffers (true|false).') + parser.add_argument('--offload_param_device', + type=str, + choices=['cpu', 'nvme'], + default=None, + help='Device to use for ZeRO parameter offload.') + parser.add_argument('--offload_param_nvme_path', + type=str, + default=None, + help='NVMe path used when offloading parameters to nvme.') + parser.add_argument('--offload_param_pin_memory', + type=str2bool, + default=None, + help='Whether to pin parameter offload memory (true|false).') + parser.add_argument('--offload_param_buffer_size', + type=int, + default=None, + help='Parameter offload buffer size (number of elements). 
Increase if embedding layer is larger than the default.')
+    parser.add_argument('--offload_param_buffer_count',
+                        type=int,
+                        default=None,
+                        help='Number of parameter offload buffers.')
+    parser.add_argument('--aio_block_size',
+                        type=int,
+                        default=1048576,
+                        help='AIO block size for NVMe offload (bytes).')
+    parser.add_argument('--aio_queue_depth',
+                        type=int,
+                        default=8,
+                        help='AIO queue depth for NVMe offload.')
+    parser.add_argument('--aio_intra_op_parallelism',
+                        type=int,
+                        default=1,
+                        help='AIO intra_op_parallelism for NVMe offload.')
+    parser.add_argument('--aio_single_submit',
+                        type=str2bool,
+                        default=False,
+                        help='AIO single_submit flag.')
+    parser.add_argument('--aio_overlap_events',
+                        type=str2bool,
+                        default=True,
+                        help='AIO overlap_events flag.')
+    parser.add_argument('--aio_use_gds',
+                        type=str2bool,
+                        default=False,
+                        help='AIO use_gds flag.')
     parser.add_argument('--dtype',
                         type=str,
                         default='fp16',
@@ -273,6 +318,26 @@ def main():
         for key, value in offload_optimizer_overrides.items()
         if value is not None
     }
+    offload_param_overrides = {
+        "device": args.offload_param_device,
+        "nvme_path": args.offload_param_nvme_path,
+        "pin_memory": args.offload_param_pin_memory,
+        "buffer_size": args.offload_param_buffer_size,
+        "buffer_count": args.offload_param_buffer_count
+    }
+    offload_param_overrides = {
+        key: value
+        for key, value in offload_param_overrides.items()
+        if value is not None
+    }
+    aio_config = {
+        "block_size": args.aio_block_size,
+        "queue_depth": args.aio_queue_depth,
+        "intra_op_parallelism": args.aio_intra_op_parallelism,
+        "single_submit": args.aio_single_submit,
+        "overlap_events": args.aio_overlap_events,
+        "use_gds": args.aio_use_gds,
+    }
     ds_config = get_train_ds_config(offload=args.offload,
                                     dtype=args.dtype,
                                     stage=args.zero_stage,
@@ -281,7 +346,11 @@ def main():
                                     tb_name="step1_model",
                                     offload_optimizer_config=(
                                         offload_optimizer_overrides
-                                        if offload_optimizer_overrides else None))
+                                        if offload_optimizer_overrides else None),
+                                    offload_param_config=(
+                                        offload_param_overrides
+                                        if offload_param_overrides else None),
+                                    aio_config=aio_config)
     ds_config[
         'train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size
     ds_config[
@@ -290,7 +359,7 @@ def main():
 
     # ds_config is now fully assembled, so print it for inspection
-    print_rank_0("***** DeepSpeed config *****", args.global_rank)
+    print_rank_0("***** User-provided DeepSpeed config *****", args.global_rank)
     print_rank_0(pformat(ds_config), args.global_rank)
 
 
     # If passed along, set the training seed now.
@@ -304,6 +373,9 @@ def main(): fast_tokenizer=True, add_special_tokens=additional_special_tokens) + print_rank_0("***** Tokenizer *****", args.global_rank) + print_rank_0(tokenizer, args.global_rank) + model = create_hf_model(AutoModelForCausalLM, args.model_name_or_path, tokenizer, @@ -323,6 +395,10 @@ def main(): model = only_optimize_lora_parameters(model) model = make_model_gradient_checkpointing_compatible(model) + # Print full model architecture (rank 0 only to avoid log spam) + print_rank_0("***** Model architecture *****", args.global_rank) + print_rank_0(model, args.global_rank) + # Prepare the data train_phase = 1 train_dataset, eval_dataset = create_prompt_dataset( diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh index c9c7d59f7..819ee5895 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh @@ -13,15 +13,20 @@ if [ "$ZERO_STAGE" == "" ]; then fi mkdir -p $OUTPUT -# nsys profile --output=nsight_reports/llama2_mem \ -deepspeed --master_port=29600 main.py \ +CUDA_VISIBLE_DEVICES=0 deepspeed --master_port=29600 main.py \ --offload \ --offload_optimizer_device nvme \ - --offload_optimizer_nvme_path /mnt/nvme2/deepspeed \ + --offload_optimizer_nvme_path /mnt/nvme0/deepspeed2 \ --offload_optimizer_pin_memory true \ --offload_optimizer_ratio 0.3 \ --offload_optimizer_buffer_count 4 \ --offload_optimizer_fast_init false \ + --offload_param_device nvme \ + --offload_param_nvme_path /mnt/nvme0/deepspeed2 \ + --offload_param_pin_memory true \ + --offload_param_buffer_size 200000000 \ + --offload_param_buffer_count 5 \ + --aio_use_gds true \ --dtype bf16 \ --data_path Dahoas/rm-static \ --data_split 2,4,4 \ @@ -42,8 +47,5 @@ deepspeed --master_port=29600 main.py \ --lora_dim 128 \ --lora_module_name "layers." 
\ --data_output_path /tmp/data_files2 \ - --output_dir $OUTPUT - - - -# &> $OUTPUT/training.log + --output_dir $OUTPUT \ + &> $OUTPUT/training.log From afb1e56b55779fb3696622a27140f7addc59f202 Mon Sep 17 00:00:00 2001 From: FengweiZhang Date: Sun, 7 Dec 2025 14:56:02 +0000 Subject: [PATCH 4/6] test: add parameter max_in_cpu config --- .../training/step1_supervised_finetuning/main.py | 7 ++++++- .../training_scripts/llama2/run_llama2_7b_lora.sh | 7 ++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py index 8d1b1ede7..44d99b76d 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py @@ -204,6 +204,10 @@ def parse_args(): type=int, default=None, help='Number of parameter offload buffers.') + parser.add_argument('--offload_param_max_in_cpu', + type=float, + default=None, + help='Maximum number of parameters to keep in CPU memory during offload.') parser.add_argument('--aio_block_size', type=int, default=1048576, @@ -323,7 +327,8 @@ def main(): "nvme_path": args.offload_param_nvme_path, "pin_memory": args.offload_param_pin_memory, "buffer_size": args.offload_param_buffer_size, - "buffer_count": args.offload_param_buffer_count + "buffer_count": args.offload_param_buffer_count, + "max_in_cpu": args.offload_param_max_in_cpu } offload_param_overrides = { key: value diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh index 819ee5895..367a3c2ab 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh @@ -19,13 +19,14 @@ CUDA_VISIBLE_DEVICES=0 deepspeed --master_port=29600 main.py \ --offload_optimizer_nvme_path /mnt/nvme0/deepspeed2 \ --offload_optimizer_pin_memory true \ --offload_optimizer_ratio 0.3 \ - --offload_optimizer_buffer_count 4 \ + --offload_optimizer_buffer_count 8 \ --offload_optimizer_fast_init false \ --offload_param_device nvme \ --offload_param_nvme_path /mnt/nvme0/deepspeed2 \ --offload_param_pin_memory true \ - --offload_param_buffer_size 200000000 \ - --offload_param_buffer_count 5 \ + --offload_param_buffer_size 199950336 \ + --offload_param_buffer_count 10 \ + --offload_param_max_in_cpu 0 \ --aio_use_gds true \ --dtype bf16 \ --data_path Dahoas/rm-static \ From 3b8976c89afb6a0d746db9c98ded74c408d63786 Mon Sep 17 00:00:00 2001 From: FengweiZhang Date: Tue, 9 Dec 2025 13:02:05 +0000 Subject: [PATCH 5/6] feat: geminifs parameter offload script --- .../training_scripts/llama2/run_llama2_7b_lora.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh index 367a3c2ab..6f69b5e9c 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh +++ 
b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh
@@ -16,16 +16,16 @@ mkdir -p $OUTPUT
 CUDA_VISIBLE_DEVICES=0 deepspeed --master_port=29600 main.py \
     --offload \
     --offload_optimizer_device nvme \
-    --offload_optimizer_nvme_path /mnt/nvme0/deepspeed2 \
+    --offload_optimizer_nvme_path /mnt/nvme_deepspeed \
     --offload_optimizer_pin_memory true \
     --offload_optimizer_ratio 0.3 \
     --offload_optimizer_buffer_count 8 \
     --offload_optimizer_fast_init false \
     --offload_param_device nvme \
-    --offload_param_nvme_path /mnt/nvme0/deepspeed2 \
+    --offload_param_nvme_path /mnt/nvme_deepspeed \
     --offload_param_pin_memory true \
-    --offload_param_buffer_size 199950336 \
-    --offload_param_buffer_count 10 \
+    --offload_param_buffer_size 134217728 \
+    --offload_param_buffer_count 32 \
     --offload_param_max_in_cpu 0 \
     --aio_use_gds true \
     --dtype bf16 \

From 494b321a06c097f95957c860af222b6f0d6b9a88 Mon Sep 17 00:00:00 2001
From: KairongChen
Date: Tue, 9 Dec 2025 14:05:35 +0000
Subject: [PATCH 6/6] support loading local models.

---
 .../DeepSpeed-Chat/dschat/utils/utils.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/applications/DeepSpeed-Chat/dschat/utils/utils.py b/applications/DeepSpeed-Chat/dschat/utils/utils.py
index 56138e834..167234d06 100644
--- a/applications/DeepSpeed-Chat/dschat/utils/utils.py
+++ b/applications/DeepSpeed-Chat/dschat/utils/utils.py
@@ -74,7 +74,6 @@ def get(self):
 
 def get_tokenizer(model_name_or_path, fast_tokenizer=True):
     if "llama" in model_name_or_path:
-        from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(
             model_name_or_path, fast_tokenizer=fast_tokenizer)
         if tokenizer.pad_token is None:
@@ -94,16 +93,13 @@ def get_tokenizer(model_name_or_path, fast_tokenizer=True):
 
 def load_hf_tokenizer(model_name_or_path,
                       fast_tokenizer=True,
                       add_special_tokens=None):
-    if os.path.exists(model_name_or_path):
-        # Locally tokenizer loading has some issue, so we need to force download
-        model_json = os.path.join(model_name_or_path, "config.json")
-        if os.path.exists(model_json):
-            model_json_file = json.load(open(model_json))
-            model_name = model_json_file.get("_name_or_path",
-                                             model_name_or_path)
-            tokenizer = get_tokenizer(model_name,
-                                      fast_tokenizer=fast_tokenizer)
+    # Load the tokenizer straight from a local checkpoint directory
+    if os.path.exists(model_name_or_path) and os.path.isdir(model_name_or_path):
+        # No need to resolve the model name from config.json anymore
+        tokenizer = get_tokenizer(model_name_or_path,
+                                  fast_tokenizer=fast_tokenizer)
     else:
+        # Fall back to the original logic for Hub model names
         tokenizer = get_tokenizer(model_name_or_path,
                                   fast_tokenizer=fast_tokenizer)
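

For reference, here is roughly the DeepSpeed configuration that the patched
get_train_ds_config() assembles for the final flag values in
run_llama2_7b_lora.sh. This is an illustrative sketch only: it assumes
ZERO_STAGE=3 and the single GPU exposed via CUDA_VISIBLE_DEVICES=0 (so
train_batch_size collapses to per_device_train_batch_size), and it elides the
bf16, TensorBoard, and stage3_* tuning keys visible in the ds_utils.py hunks
above.

    # Sketch of the effective config (Python dict, as built in ds_utils.py).
    # All values come from the CLI flags and argparse defaults added in
    # patches 2-4; nothing here is part of the patches themselves.
    ds_config_sketch = {
        "train_batch_size": 4,  # 4 per device * 1 GPU * 1 grad-accum step
        "train_micro_batch_size_per_gpu": 4,
        "steps_per_print": 10,
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {           # --offload_optimizer_* overrides
                "device": "nvme",
                "nvme_path": "/mnt/nvme_deepspeed",
                "pin_memory": True,
                "ratio": 0.3,
                "buffer_count": 8,
                "fast_init": False,
            },
            "offload_param": {               # --offload_param_* overrides
                "device": "nvme",
                "nvme_path": "/mnt/nvme_deepspeed",
                "pin_memory": True,
                "buffer_size": 134217728,
                "buffer_count": 32,
                "max_in_cpu": 0,             # cache nothing in CPU RAM
            },
            "memory_efficient_linear": False,
        },
        # The --aio_* flags always populate a top-level "aio" section.
        "aio": {
            "block_size": 1048576,
            "queue_depth": 8,
            "intra_op_parallelism": 1,
            "single_submit": False,
            "overlap_events": True,
            "use_gds": True,                 # --aio_use_gds true
        },
    }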