diff --git a/models/modeling_llama_opt.py b/models/modeling_llama_opt.py
index 9dc69bc..a1adbae 100644
--- a/models/modeling_llama_opt.py
+++ b/models/modeling_llama_opt.py
@@ -1335,13 +1335,6 @@ def forward(
         # embed positions
         hidden_states = inputs_embeds
 
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
diff --git a/run_clm.sh b/run_clm.sh
index ebe6c32..0ca1859 100644
--- a/run_clm.sh
+++ b/run_clm.sh
@@ -18,6 +18,7 @@ export LCKV_FUSED_SWIGLU=1
 # - to pretrain a tinyllama, change the config to `TinyLlama/TinyLlama-1.1B-intermediate-step-955k-token-2T`
 # - to intialize the model with a pretrained model, add `--model_name_or_path TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T`
 # - to use the minipile dataset, use `--dataset_name JeanKaddour/minipile`, with proper `--preprocessing_num_workers`
+# - to use gradient checkpointing, add `--gradient_checkpointing`
 # - to enable wandb, use `--report_to wandb`
 accelerate launch run_clm.py \
     --tokenizer_name TinyLlama/TinyLlama-1.1B-intermediate-step-955k-token-2T \