diff --git a/README.md b/README.md index 7faf222..c4b86f4 100644 --- a/README.md +++ b/README.md @@ -52,10 +52,9 @@ Use `run_pretraining.py` to pre-train an ELECTRA model. It has the following arg If training is halted, re-running the `run_pretraining.py` with the same arguments will continue the training where it left off. You can continue pre-training from the released ELECTRA checkpoints by -1. Setting the model-name to point to a downloaded model (e.g., `--model-name electra_small` if you downloaded weights to `$DATA_DIR/electra_small`). -2. Setting `num_train_steps` by (for example) adding `"num_train_steps": 4010000` to the `--hparams`. This will continue training the small model for 10000 more steps (it has already been trained for 4e6 steps). -3. Increase the learning rate to account for the linear learning rate decay. For example, to start with a learning rate of 2e-4 you should set the `learning_rate` hparam to 2e-4 * (4e6 + 10000) / 10000. -4. For ELECTRA-Small, you also need to specifiy `"generator_hidden_size": 1.0` in the `hparams` because we did not use a small generator for that model. +1. Setting the model-name to point to a downloaded model (e.g., `--model-name electra_small`). +2. Specifying an initial checkpoint (usually from a released ELECTRA checkpoint, e.g., `--init_checkpoint pretrained_models_dir/electra_small`). +3. For ELECTRA-Small, you also need to specify `"generator_hidden_size": 1.0` in the `hparams` because we did not use a small generator for that model. ## Quickstart: Pre-train a small ELECTRA model. These instructions pre-train a small ELECTRA model (12 layers, 256 hidden size). Unfortunately, the data we used in the paper is not publicly available, so we will use the [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/) released by Aaron Gokaslan and Vanya Cohen instead. 
The fully-trained model (~4 days on a v100 GPU) should perform roughly in between [GPT](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf) and BERT-Base in terms of GLUE performance. By default the model is trained on length-128 sequences, so it is not suitable for running on question answering. See the "expected results" section below for more details on model performance. diff --git a/configure_pretraining.py b/configure_pretraining.py index f576563..fb402e7 100644 --- a/configure_pretraining.py +++ b/configure_pretraining.py @@ -44,6 +44,7 @@ def __init__(self, model_name, data_dir, **kwargs): self.num_warmup_steps = 10000 # training settings + self.init_checkpoint = None self.iterations_per_loop = 200 self.save_checkpoints_steps = 1000 self.num_train_steps = 1000000 diff --git a/run_pretraining.py b/run_pretraining.py index 599c9f9..039d884 100644 --- a/run_pretraining.py +++ b/run_pretraining.py @@ -272,6 +272,35 @@ def model_fn(features, labels, mode, params): model = PretrainingModel(config, features, mode == tf.estimator.ModeKeys.TRAIN) utils.log("Model is built!") + + # Load pre-trained weights from checkpoint + tvars = tf.trainable_variables() + + init_checkpoint = tf.train.latest_checkpoint(config.init_checkpoint) + utils.log("Using checkpoint", init_checkpoint) + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + assignment_map, initialized_variable_names = modeling.get_assignment_map_from_checkpoint( + tvars, init_checkpoint) + if config.use_tpu: + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + utils.log("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" utils.log(" name = %s, shape = %s%s", var.name, var.shape, init_string) + if mode == tf.estimator.ModeKeys.TRAIN: train_op = optimization.create_optimizer( model.total_loss, config.learning_rate, config.num_train_steps, @@ -284,6 +313,7 @@ def model_fn(features, labels, mode, params): mode=mode, loss=model.total_loss, train_op=train_op, + scaffold_fn=scaffold_fn, training_hooks=[training_utils.ETAHook( {} if config.use_tpu else dict(loss=model.total_loss), config.num_train_steps, config.iterations_per_loop, @@ -293,6 +323,7 @@ def model_fn(features, labels, mode, params): output_spec = tf.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=model.total_loss, + scaffold_fn=scaffold_fn, eval_metrics=model.eval_metrics, evaluation_hooks=[training_utils.ETAHook( {} if config.use_tpu else dict(loss=model.total_loss),