diff --git a/examples/finetune_modernbert_on_glue.ipynb b/examples/finetune_modernbert_on_glue.ipynb
index f8b17bdc..5bbebffa 100644
--- a/examples/finetune_modernbert_on_glue.ipynb
+++ b/examples/finetune_modernbert_on_glue.ipynb
@@ -348,7 +348,7 @@
     "    \"rte\": {\n",
     "        \"abbr\": \"RTE\",\n",
     "        \"name\": \"Recognize Textual Entailment\",\n",
-    "        \"description\": \"Predict whether one sentece entails another\",\n",
+    "        \"description\": \"Predict whether one sentence entails another\",\n",
     "        \"task_type\": \"Inference Tasks\",\n",
     "        \"domain\": \"News, Wikipedia\",\n",
     "        \"size\": \"2.5k\",\n",
@@ -528,7 +528,7 @@
    "source": [
     "### Tokenizer\n",
     "\n",
-    "Next we define our Tokenizer and a preprocess function to create the input_ids, attention_mask, and token_type_ids the model nees to train. For this example, including `truncation=True` is enough as we'll rely on our data collation function below to put our batches into the correct shape."
+    "Next we define our Tokenizer and a preprocess function to create the input_ids, attention_mask, and token_type_ids the model needs to train. For this example, including `truncation=True` is enough as we'll rely on our data collation function below to put our batches into the correct shape."
    ]
   },
   {
diff --git a/main.py b/main.py
index 6725689c..fa3e7aa3 100755
--- a/main.py
+++ b/main.py
@@ -361,7 +361,7 @@ def init_from_checkpoint(cfg: DictConfig, new_model: nn.Module):
         new_model=new_model.model,
         mode=cfg.get("mode", "tile_weights_from_middle"),
     )
-    print(f"Initalized model from checkpoint {cfg.checkpoint_run_name} with {n_params=:.4e} parameters")
+    print(f"Initialized model from checkpoint {cfg.checkpoint_run_name} with {n_params=:.4e} parameters")


 def main(cfg: DictConfig, return_trainer: bool = False, do_train: bool = True) -> Optional[Trainer]:
diff --git a/src/bert_layers/configuration_bert.py b/src/bert_layers/configuration_bert.py
index 6fbdeb53..8e94d017 100644
--- a/src/bert_layers/configuration_bert.py
+++ b/src/bert_layers/configuration_bert.py
@@ -24,7 +24,7 @@ def __init__(
                 create when initializing the model. You should be able to ignore this parameter in most cases. Defaults to 512.
             attention_probs_dropout_prob (float): By default, turn off attention dropout in MosaicBERT
-                Note that the custom Triton Flash Attention with ALiBi implementation does not support droput.
+                Note that the custom Triton Flash Attention with ALiBi implementation does not support dropout.
                 However, Flash Attention 2 supports ALiBi and dropout https://github.com/Dao-AILab/flash-attention
             embed_dropout_prob (float): Dropout probability for the embedding layer.
             attn_out_dropout_prob (float): Dropout probability for the attention output layer.
@@ -155,7 +155,7 @@ def __init__(
             unpad_embeddings (bool): Unpad inputs before the embedding layer.
             pad_logits (bool): Pad logits after the calculating the loss.
             compile_model (bool): Compile the subset of the model which can be compiled.
-            masked_prediction (bool): Use only pass the masked tokens throught the final MLM layers
+            masked_prediction (bool): Use only pass the masked tokens through the final MLM layers
             **kwargs: Additional keyword arguments.
         """
         super().__init__(attention_probs_dropout_prob=attention_probs_dropout_prob, **kwargs)
diff --git a/src/bert_layers/rotary.py b/src/bert_layers/rotary.py
index 189d6a86..4f39fd40 100644
--- a/src/bert_layers/rotary.py
+++ b/src/bert_layers/rotary.py
@@ -186,7 +186,7 @@ def __init__(
                 we add this option.
             max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
                 up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
-                the cos_sin_cache wll be recomputed during the forward pass.
+                the cos_sin_cache will be recomputed during the forward pass.
         """
         super().__init__()
         self.dim = dim
diff --git a/src/evals/README.md b/src/evals/README.md
index c898e9c8..1855a8de 100644
--- a/src/evals/README.md
+++ b/src/evals/README.md
@@ -40,7 +40,7 @@ python eval.py yamls/ablations/checkpoint_name.yaml

 ## Automatically generate eval configs for multiple checkpoints and run evals on multiple GPUs

-`run_evals_from_checkpoints.py` can be used to automatically generate configs from the latest checkpoints in a given directory, and run all evals on all avalible GPUs.
+`run_evals_from_checkpoints.py` can be used to automatically generate configs from the latest checkpoints in a given directory, and run all evals on all available GPUs.

 Run `python run_evals_from_checkpoints.py --help` for all options. All options from `generate_eval_config_from_checkpoint.py` are also available.
diff --git a/src/flex_bert.py b/src/flex_bert.py
index c4728fbd..bb8d9e47 100644
--- a/src/flex_bert.py
+++ b/src/flex_bert.py
@@ -41,7 +41,7 @@
 all = ["create_flex_bert_mlm", "create_flex_bert_classification"]

-# we want the efficent versions to have the same name as the TorchMetrics' name
+# we want the efficient versions to have the same name as the TorchMetrics' name
 def rename_class(new_name):
     def class_renamer(cls):
         cls.__name__ = new_name
@@ -398,7 +398,7 @@ def create_flex_bert_classification(
         First, it will switch the training loss to :class:`~torch.nn.MSELoss`.
         Second, the returned :class:`.ComposerModel`'s train/validation
         metrics will be :class:`~torchmetrics.MeanSquaredError` and
-        :class:`~torchmetrics.SpearmanCorrCoef`. For the classifcation case
+        :class:`~torchmetrics.SpearmanCorrCoef`. For the classification case
         (when ``num_labels > 1``), the training loss is
         :class:`~torch.nn.CrossEntropyLoss`, and the train/validation metrics
         are :class:`~torchmetrics.MulticlassAccuracy` and
diff --git a/src/hf_bert.py b/src/hf_bert.py
index f761821b..a25bd9e2 100644
--- a/src/hf_bert.py
+++ b/src/hf_bert.py
@@ -186,7 +186,7 @@ def create_hf_bert_classification(
         This will have two noteworthy effects. First, it will switch the training loss to :class:`~torch.nn.MSELoss`.
         Second, the returned :class:`.ComposerModel`'s train/validation metrics will be :class:`~torchmetrics.MeanSquaredError` and :class:`~torchmetrics.SpearmanCorrCoef`.
-        For the classifcation case (when ``num_labels > 1``), the training loss is :class:`~torch.nn.CrossEntropyLoss`, and the train/validation
+        For the classification case (when ``num_labels > 1``), the training loss is :class:`~torch.nn.CrossEntropyLoss`, and the train/validation
         metrics are :class:`~torchmetrics.MulticlassAccuracy` and :class:`~torchmetrics.MatthewsCorrCoef`, as well as :class:`.BinaryF1Score` if ``num_labels == 2``.
     """
     try:
diff --git a/src/mosaic_bert.py b/src/mosaic_bert.py
index 702cf812..1063e8d3 100644
--- a/src/mosaic_bert.py
+++ b/src/mosaic_bert.py
@@ -230,7 +230,7 @@ def create_mosaic_bert_classification(
         First, it will switch the training loss to :class:`~torch.nn.MSELoss`.
         Second, the returned :class:`.ComposerModel`'s train/validation
         metrics will be :class:`~torchmetrics.MeanSquaredError` and
-        :class:`~torchmetrics.SpearmanCorrCoef`. For the classifcation case
+        :class:`~torchmetrics.SpearmanCorrCoef`. For the classification case
         (when ``num_labels > 1``), the training loss is
         :class:`~torch.nn.CrossEntropyLoss`, and the train/validation metrics
         are :class:`~torchmetrics.MulticlassAccuracy` and
diff --git a/wandb_log_live_eval.py b/wandb_log_live_eval.py
index 2b4bbf2a..df1db77f 100644
--- a/wandb_log_live_eval.py
+++ b/wandb_log_live_eval.py
@@ -45,7 +45,7 @@ def process_data(args):
         try:
             meta = parse_model_string(run.name)
         except ValueError:
-            print(f"Skipping run with unparseable name: {run.name}")
+            print(f"Skipping run with unparsable name: {run.name}")
             continue
         task = meta["task"]
         summary = run.summary
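For context on the notebook hunk above: the edited markdown cell describes the preprocessing step only in prose (tokenize with `truncation=True`, defer batch shaping to the data collator). A minimal sketch of that flow, assuming the GLUE RTE column names (`sentence1`, `sentence2`) and a placeholder checkpoint name, neither of which is pinned down by this diff, might look like:

```python
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# NOTE: the checkpoint name is a placeholder assumption, not taken from this diff.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

def preprocess(batch):
    # truncation=True is enough here; padding is deferred to the collator below,
    # which shapes each batch at collation time.
    return tokenizer(batch["sentence1"], batch["sentence2"], truncation=True)

rte = load_dataset("glue", "rte")                         # RTE: sentence-pair entailment
tokenized = rte.map(preprocess, batched=True)
collator = DataCollatorWithPadding(tokenizer=tokenizer)   # pads each batch to its longest example
```

Deferring padding to `DataCollatorWithPadding` keeps each batch padded only to its own longest sequence rather than to a global maximum length.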
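The docstring fixes in `src/flex_bert.py`, `src/hf_bert.py`, and `src/mosaic_bert.py` all describe the same loss/metric switch on `num_labels`. A rough sketch of that documented behaviour, with a hypothetical helper name (the actual factories wire this into a ComposerModel rather than returning a tuple), is:

```python
import torch.nn as nn
from torchmetrics import MeanSquaredError, SpearmanCorrCoef
from torchmetrics.classification import BinaryF1Score, MatthewsCorrCoef, MulticlassAccuracy

def pick_loss_and_metrics(num_labels: int):
    """Hypothetical helper mirroring the behaviour the create_*_classification docstrings describe."""
    if num_labels == 1:
        # Regression setup (e.g. STS-B): MSE loss, MSE + Spearman correlation metrics.
        return nn.MSELoss(), [MeanSquaredError(), SpearmanCorrCoef()]
    # Classification setup: cross-entropy loss, accuracy + Matthews correlation metrics.
    metrics = [
        MulticlassAccuracy(num_classes=num_labels, average="micro"),
        MatthewsCorrCoef(task="multiclass", num_classes=num_labels),
    ]
    if num_labels == 2:
        metrics.append(BinaryF1Score())
    return nn.CrossEntropyLoss(), metrics
```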