diff --git a/.github/workflows/build_and_deploy_documentation.yml b/.github/workflows/build_and_deploy_documentation.yml index c6595f53d..32096ed4b 100644 --- a/.github/workflows/build_and_deploy_documentation.yml +++ b/.github/workflows/build_and_deploy_documentation.yml @@ -25,7 +25,7 @@ jobs: run: | sudo apt-get update sudo apt-get install git -y - python -m pip install torch==2.6.0 + python -m pip install torch==2.7.1 python -m pip install --upgrade pip setuptools wheel export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE python -m pip install -e . diff --git a/.github/workflows/tests_full.yml b/.github/workflows/tests_full.yml index 72deb4490..a1326c62f 100644 --- a/.github/workflows/tests_full.yml +++ b/.github/workflows/tests_full.yml @@ -23,11 +23,11 @@ jobs: sudo apt-get update sudo apt-get install curl -y # required by coveralls sudo apt-get install git -y - python -m pip install torch==2.6.0 + python -m pip install torch==2.7.1 python -m pip install --upgrade pip setuptools wheel export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE python -m pip install ninja # Lowers compilation time of flash attention significantly - python -m pip install flash-attn==2.7.4.post1 --no-build-isolation + python -m pip install flash-attn==2.8.0.post2 --no-build-isolation python -m pip install -e .[tests] - name: Run tests run: | diff --git a/README.md b/README.md index 1e6bdd569..686dc1ebe 100644 --- a/README.md +++ b/README.md @@ -42,9 +42,9 @@ conda create -n modalities python=3.11 conda activate modalities # install PyTorch, Ninja and Flash Attention (mandatory) -pip install torch==2.6.0 +pip install torch==2.7.1 pip install ninja # Lowers compilation time of flash attention significantly -pip install flash-attn==2.7.4.post1 --no-build-isolation +pip install flash-attn==2.8.0.post2 --no-build-isolation ``` ### Option 1: Installation from source @@ -80,9 +80,9 @@ pip install modalities curl -LsSf https://astral.sh/uv/install.sh | sh uv venv --seed --python 3.11 --prompt modalities source 
.venv/bin/activate -uv pip install torch +uv pip install torch==2.7.1 uv pip install ninja -uv pip install --no-build-isolation flash-attn==2.7.4.post1 +uv pip install --no-build-isolation flash-attn==2.8.0.post2 # for developer: use [tests,linting] and install pre-commit hooks uv pip install -e .[tests,linting] pre-commit install --install-hooks diff --git a/config_files/training/config_lorem_ipsum_long_fsdp1_warmstart.yaml b/config_files/training/config_lorem_ipsum_long_fsdp1_warmstart.yaml index 98b66edc5..7235950a0 100644 --- a/config_files/training/config_lorem_ipsum_long_fsdp1_warmstart.yaml +++ b/config_files/training/config_lorem_ipsum_long_fsdp1_warmstart.yaml @@ -67,7 +67,7 @@ settings: variant_key: last_step_from_checkpoint_path config: checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} - warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths} + warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths} # to populate this, launch with: modalities warmstart [..] --last_checkpoint_info_file_path [..] collate_fn: component_key: collate_fn diff --git a/config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml b/config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml index c502acae9..d7b465364 100644 --- a/config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml +++ b/config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml @@ -67,8 +67,7 @@ settings: variant_key: last_step_from_checkpoint_path config: checkpoint_path: ${settings.warmstart_checkpoint_paths.checkpoint_folder_path} - warmstart_checkpoint_paths: # ${warmstart_env:checkpoint_paths} - checkpoint_folder_path: /raid/fromm/modalities/data/checkpoints/2025-04-16__12-40-51_6dcbb1a0/eid_2025-04-16__12-40-51_6dcbb1a0-seen_steps_32-seen_tokens_65536-target_steps_162-target_tokens_331776 + warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths} # to populate this, launch with: modalities warmstart [..] --last_checkpoint_info_file_path [..]
collate_fn: component_key: collate_fn diff --git a/pyproject.toml b/pyproject.toml index 5a3c84bf1..17bc6ee0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ description = "Modalities, a PyTorch-native framework for distributed and reprod readme = "README.md" dependencies = [ "numpy<2.0", - "torch==2.6.0", + "torch==2.7.1", "packaging", "tqdm", "pyyaml", diff --git a/src/modalities/__main__.py b/src/modalities/__main__.py index 4cd4f6118..82322ee85 100644 --- a/src/modalities/__main__.py +++ b/src/modalities/__main__.py @@ -592,7 +592,7 @@ def build_components(self, components_model_type: Type[BaseModel]) -> BaseModel: return components def run(self, components: TrainingComponentsInstantiationModel): - """Entrypoint fo running the training process. + """Entrypoint for running the training process. We pass in a TrainingComponentsInstantiationModel, which is a pydantic model that contains all the components needed for the training process. diff --git a/src/modalities/logging_broker/subscriber_impl/results_subscriber.py b/src/modalities/logging_broker/subscriber_impl/results_subscriber.py index 05f3c2fd3..aa9aed58f 100644 --- a/src/modalities/logging_broker/subscriber_impl/results_subscriber.py +++ b/src/modalities/logging_broker/subscriber_impl/results_subscriber.py @@ -70,7 +70,12 @@ def __init__( with open(config_file_path, "r", encoding="utf-8") as file: config = yaml.safe_load(file) self.run = wandb.init( - project=project, name=experiment_id, mode=mode.value.lower(), dir=logging_directory, config=config + project=project, + name=experiment_id, + mode=mode.value.lower(), + dir=logging_directory, + config=config, + settings=wandb.Settings(init_timeout=120), ) self.run.log_artifact(config_file_path, name=f"config_{wandb.run.id}", type="config") diff --git a/tutorials/warmstart/configs/warmstart_config.yaml b/tutorials/warmstart/configs/warmstart_config.yaml index 199c88be1..e546ed777 100644 --- a/tutorials/warmstart/configs/warmstart_config.yaml 
+++ b/tutorials/warmstart/configs/warmstart_config.yaml @@ -50,7 +50,7 @@ settings: variant_key: last_step_from_checkpoint_path config: checkpoint_path: ${settings.warmstart_checkpoint_paths.checkpoint_folder_path} - warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths} + warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths} # to populate this, launch with: modalities warmstart [..] --last_checkpoint_info_file_path [..] collate_fn: component_key: collate_fn