diff --git a/examples/finetune_modernbert_on_glue.ipynb b/examples/finetune_modernbert_on_glue.ipynb
index f8b17bdc..5bbebffa 100644
--- a/examples/finetune_modernbert_on_glue.ipynb
+++ b/examples/finetune_modernbert_on_glue.ipynb
@@ -348,7 +348,7 @@
     "    \"rte\": {\n",
     "        \"abbr\": \"RTE\",\n",
     "        \"name\": \"Recognize Textual Entailment\",\n",
-    "        \"description\": \"Predict whether one sentece entails another\",\n",
+    "        \"description\": \"Predict whether one sentence entails another\",\n",
     "        \"task_type\": \"Inference Tasks\",\n",
     "        \"domain\": \"News, Wikipedia\",\n",
     "        \"size\": \"2.5k\",\n",
@@ -528,7 +528,7 @@
    "source": [
     "### Tokenizer\n",
     "\n",
-    "Next we define our Tokenizer and a preprocess function to create the input_ids, attention_mask, and token_type_ids the model nees to train. For this example, including `truncation=True` is enough as we'll rely on our data collation function below to put our batches into the correct shape."
+    "Next we define our Tokenizer and a preprocess function to create the input_ids, attention_mask, and token_type_ids the model needs to train. For this example, including `truncation=True` is enough as we'll rely on our data collation function below to put our batches into the correct shape."
    ]
   },
   {
diff --git a/main.py b/main.py
index 6725689c..fa3e7aa3 100755
--- a/main.py
+++ b/main.py
@@ -361,7 +361,7 @@ def init_from_checkpoint(cfg: DictConfig, new_model: nn.Module):
         new_model=new_model.model,
         mode=cfg.get("mode", "tile_weights_from_middle"),
     )
-    print(f"Initalized model from checkpoint {cfg.checkpoint_run_name} with {n_params=:.4e} parameters")
+    print(f"Initialized model from checkpoint {cfg.checkpoint_run_name} with {n_params=:.4e} parameters")


 def main(cfg: DictConfig, return_trainer: bool = False, do_train: bool = True) -> Optional[Trainer]:
diff --git a/src/bert_layers/configuration_bert.py b/src/bert_layers/configuration_bert.py
index 6fbdeb53..8e94d017 100644
--- a/src/bert_layers/configuration_bert.py
+++ b/src/bert_layers/configuration_bert.py
@@ -24,7 +24,7 @@ def __init__(
                 create when initializing the model. You should be able to ignore this parameter in most cases. Defaults to 512.
             attention_probs_dropout_prob (float): By default, turn off attention dropout in MosaicBERT
-                Note that the custom Triton Flash Attention with ALiBi implementation does not support droput.
+                Note that the custom Triton Flash Attention with ALiBi implementation does not support dropout.
                 However, Flash Attention 2 supports ALiBi and dropout https://github.com/Dao-AILab/flash-attention
             embed_dropout_prob (float): Dropout probability for the embedding layer.
             attn_out_dropout_prob (float): Dropout probability for the attention output layer.
@@ -155,7 +155,7 @@ def __init__(
             unpad_embeddings (bool): Unpad inputs before the embedding layer.
             pad_logits (bool): Pad logits after the calculating the loss.
             compile_model (bool): Compile the subset of the model which can be compiled.
-            masked_prediction (bool): Use only pass the masked tokens throught the final MLM layers
+            masked_prediction (bool): Use only pass the masked tokens through the final MLM layers
             **kwargs: Additional keyword arguments.
         """
         super().__init__(attention_probs_dropout_prob=attention_probs_dropout_prob, **kwargs)
diff --git a/src/bert_layers/rotary.py b/src/bert_layers/rotary.py
index 189d6a86..4f39fd40 100644
--- a/src/bert_layers/rotary.py
+++ b/src/bert_layers/rotary.py
@@ -186,7 +186,7 @@ def __init__(
                 we add this option.
             max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
                 up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
-                the cos_sin_cache wll be recomputed during the forward pass.
+                the cos_sin_cache will be recomputed during the forward pass.
         """
         super().__init__()
         self.dim = dim
diff --git a/src/evals/README.md b/src/evals/README.md
index c898e9c8..1855a8de 100644
--- a/src/evals/README.md
+++ b/src/evals/README.md
@@ -40,7 +40,7 @@ python eval.py yamls/ablations/checkpoint_name.yaml

 ## Automatically generate eval configs for multiple checkpoints and run evals on multiple GPUs

-`run_evals_from_checkpoints.py` can be used to automatically generate configs from the latest checkpoints in a given directory, and run all evals on all avalible GPUs.
+`run_evals_from_checkpoints.py` can be used to automatically generate configs from the latest checkpoints in a given directory, and run all evals on all available GPUs.

 Run `python run_evals_from_checkpoints.py --help` for all options. All options from `generate_eval_config_from_checkpoint.py` are also available.
diff --git a/src/flex_bert.py b/src/flex_bert.py
index c4728fbd..bb8d9e47 100644
--- a/src/flex_bert.py
+++ b/src/flex_bert.py
@@ -41,7 +41,7 @@
 all = ["create_flex_bert_mlm", "create_flex_bert_classification"]

-# we want the efficent versions to have the same name as the TorchMetrics' name
+# we want the efficient versions to have the same name as the TorchMetrics' name
 def rename_class(new_name):
     def class_renamer(cls):
         cls.__name__ = new_name
@@ -398,7 +398,7 @@ def create_flex_bert_classification(
         First, it will switch the training loss to :class:`~torch.nn.MSELoss`.
         Second, the returned :class:`.ComposerModel`'s train/validation
         metrics will be :class:`~torchmetrics.MeanSquaredError` and
-        :class:`~torchmetrics.SpearmanCorrCoef`. For the classifcation case
+        :class:`~torchmetrics.SpearmanCorrCoef`. For the classification case
         (when ``num_labels > 1``), the training loss is
         :class:`~torch.nn.CrossEntropyLoss`, and the train/validation metrics
         are :class:`~torchmetrics.MulticlassAccuracy` and
diff --git a/src/hf_bert.py b/src/hf_bert.py
index f761821b..a25bd9e2 100644
--- a/src/hf_bert.py
+++ b/src/hf_bert.py
@@ -186,7 +186,7 @@ def create_hf_bert_classification(
         This will have two noteworthy effects. First, it will switch the training loss to :class:`~torch.nn.MSELoss`.
         Second, the returned :class:`.ComposerModel`'s train/validation metrics will be :class:`~torchmetrics.MeanSquaredError` and :class:`~torchmetrics.SpearmanCorrCoef`.
-        For the classifcation case (when ``num_labels > 1``), the training loss is :class:`~torch.nn.CrossEntropyLoss`, and the train/validation
+        For the classification case (when ``num_labels > 1``), the training loss is :class:`~torch.nn.CrossEntropyLoss`, and the train/validation
         metrics are :class:`~torchmetrics.MulticlassAccuracy` and :class:`~torchmetrics.MatthewsCorrCoef`, as well as :class:`.BinaryF1Score` if ``num_labels == 2``.
     """
     try:
diff --git a/src/mosaic_bert.py b/src/mosaic_bert.py
index 702cf812..1063e8d3 100644
--- a/src/mosaic_bert.py
+++ b/src/mosaic_bert.py
@@ -230,7 +230,7 @@ def create_mosaic_bert_classification(
         First, it will switch the training loss to :class:`~torch.nn.MSELoss`.
         Second, the returned :class:`.ComposerModel`'s train/validation
         metrics will be :class:`~torchmetrics.MeanSquaredError` and
-        :class:`~torchmetrics.SpearmanCorrCoef`. For the classifcation case
+        :class:`~torchmetrics.SpearmanCorrCoef`. For the classification case
         (when ``num_labels > 1``), the training loss is
         :class:`~torch.nn.CrossEntropyLoss`, and the train/validation metrics
         are :class:`~torchmetrics.MulticlassAccuracy` and
diff --git a/wandb_log_live_eval.py b/wandb_log_live_eval.py
index 2b4bbf2a..df1db77f 100644
--- a/wandb_log_live_eval.py
+++ b/wandb_log_live_eval.py
@@ -45,7 +45,7 @@ def process_data(args):
         try:
             meta = parse_model_string(run.name)
         except ValueError:
-            print(f"Skipping run with unparseable name: {run.name}")
+            print(f"Skipping run with unparsable name: {run.name}")
             continue
         task = meta["task"]
         summary = run.summary
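For context on the notebook hunk above: the edited markdown cell describes the preprocessing step only in prose (tokenize with `truncation=True`, defer batch shaping to the data collator). A minimal sketch of that flow, assuming the GLUE RTE column names (`sentence1`, `sentence2`) and a placeholder checkpoint name, neither of which is pinned down by this diff, might look like:

```python
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# NOTE: the checkpoint name is a placeholder assumption, not taken from this diff.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

def preprocess(batch):
    # truncation=True is enough here; padding is deferred to the collator below,
    # which shapes each batch at collation time.
    return tokenizer(batch["sentence1"], batch["sentence2"], truncation=True)

rte = load_dataset("glue", "rte")                         # RTE: sentence-pair entailment
tokenized = rte.map(preprocess, batched=True)
collator = DataCollatorWithPadding(tokenizer=tokenizer)   # pads each batch to its longest example
```

Deferring padding to `DataCollatorWithPadding` keeps each batch padded only to its own longest sequence rather than to a global maximum length.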
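The docstring fixes in `src/flex_bert.py`, `src/hf_bert.py`, and `src/mosaic_bert.py` all describe the same loss/metric switch on `num_labels`. A rough sketch of that documented behaviour, with a hypothetical helper name (the actual factories wire this into a ComposerModel rather than returning a tuple), is:

```python
import torch.nn as nn
from torchmetrics import MeanSquaredError, SpearmanCorrCoef
from torchmetrics.classification import BinaryF1Score, MatthewsCorrCoef, MulticlassAccuracy

def pick_loss_and_metrics(num_labels: int):
    """Hypothetical helper mirroring the behaviour the create_*_classification docstrings describe."""
    if num_labels == 1:
        # Regression setup (e.g. STS-B): MSE loss, MSE + Spearman correlation metrics.
        return nn.MSELoss(), [MeanSquaredError(), SpearmanCorrCoef()]
    # Classification setup: cross-entropy loss, accuracy + Matthews correlation metrics.
    metrics = [
        MulticlassAccuracy(num_classes=num_labels, average="micro"),
        MatthewsCorrCoef(task="multiclass", num_classes=num_labels),
    ]
    if num_labels == 2:
        metrics.append(BinaryF1Score())
    return nn.CrossEntropyLoss(), metrics
```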