diff --git a/models/modeling_llama_opt.py b/models/modeling_llama_opt.py
index 9dc69bc..a1adbae 100644
--- a/models/modeling_llama_opt.py
+++ b/models/modeling_llama_opt.py
@@ -1335,13 +1335,6 @@ def forward(
         # embed positions
         hidden_states = inputs_embeds
 
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
diff --git a/run_clm.sh b/run_clm.sh
index ebe6c32..0ca1859 100644
--- a/run_clm.sh
+++ b/run_clm.sh
@@ -18,6 +18,7 @@ export LCKV_FUSED_SWIGLU=1
 # - to pretrain a tinyllama, change the config to `TinyLlama/TinyLlama-1.1B-intermediate-step-955k-token-2T`
 # - to intialize the model with a pretrained model, add `--model_name_or_path TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T`
 # - to use the minipile dataset, use `--dataset_name JeanKaddour/minipile`, with proper `--preprocessing_num_workers`
+# - to use gradient checkpointing, add `--gradient_checkpointing`
 # - to enable wandb, use `--report_to wandb`
 accelerate launch run_clm.py \
     --tokenizer_name TinyLlama/TinyLlama-1.1B-intermediate-step-955k-token-2T \