From 20fac0028adffba9fa89ef6f5c68b5327a553348 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Wed, 8 Oct 2025 11:00:52 +0200 Subject: [PATCH 01/10] chore(cache): for generating test outputs --- tests/resources/generate-outputs.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/resources/generate-outputs.sh b/tests/resources/generate-outputs.sh index 921895d1a6..4d3bcb08c2 100755 --- a/tests/resources/generate-outputs.sh +++ b/tests/resources/generate-outputs.sh @@ -7,9 +7,9 @@ ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) cd "$ROOT_DIR" -mtt train options.yaml -o model-32-bit.pt -r base_precision=32 -mtt train options.yaml -o model-64-bit.pt -r base_precision=64 -mtt train options-pet.yaml -o model-pet.pt +[ -f "model-32-bit.pt" ] || mtt train options.yaml -o model-32-bit.pt -r base_precision=32 +[ -f "model-64-bit.pt" ] || mtt train options.yaml -o model-64-bit.pt -r base_precision=64 +[ -f "model-pet.pt" ] || mtt train options-pet.yaml -o model-pet.pt set +x # disable command echoing for sensitive private token check TOKEN_PRESENT=false From a499e60d552027ce1ca2249b67b79e9e3eb025b4 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Wed, 8 Oct 2025 11:06:34 +0200 Subject: [PATCH 02/10] doc(contrib): update with new caching data details --- CONTRIBUTING.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index bd7ed2ea37..394f835a5d 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -89,6 +89,14 @@ testing it. Also, you may want to setup your editor to automatically apply the ` are plugins to do this with `all major editors `_. +The main test suite relies on pre-generated model files that are cached for performance. +If you need to force a regeneration of these files, you can set the ``FORCE_REGENERATE`` +environment variable to ``1`` before running the tests: + +.. 
code-block:: bash + + FORCE_REGENERATE=1 tox -e tests + If you want to test a specific archicture you can also do it. For example .. code-block:: bash From 4f58bdfefc6d7205af5a986abc41f88870c468ec Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Wed, 8 Oct 2025 11:07:02 +0200 Subject: [PATCH 03/10] enh(cache): now check for envvar --- tests/resources/generate-outputs.sh | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/resources/generate-outputs.sh b/tests/resources/generate-outputs.sh index 4d3bcb08c2..f7da9d0315 100755 --- a/tests/resources/generate-outputs.sh +++ b/tests/resources/generate-outputs.sh @@ -7,9 +7,24 @@ ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) cd "$ROOT_DIR" -[ -f "model-32-bit.pt" ] || mtt train options.yaml -o model-32-bit.pt -r base_precision=32 -[ -f "model-64-bit.pt" ] || mtt train options.yaml -o model-64-bit.pt -r base_precision=64 -[ -f "model-pet.pt" ] || mtt train options-pet.yaml -o model-pet.pt +FORCE_REGENERATE=false +if [[ "${FORCE_REGENERATE:-0}" == "1" ]]; then + echo "FORCE_REGENERATE=1 detected. Regenerating all models." + FORCE_REGENERATE=true +fi + +# Regenerate if --force is used OR if the file doesn't exist +if [ "$FORCE_REGENERATE" = true ] || [ ! -f "model-32-bit.pt" ]; then + mtt train options.yaml -o model-32-bit.pt -r base_precision=32 +fi + +if [ "$FORCE_REGENERATE" = true ] || [ ! -f "model-64-bit.pt" ]; then + mtt train options.yaml -o model-64-bit.pt -r base_precision=64 +fi + +if [ "$FORCE_REGENERATE" = true ] || [ ! 
-f "model-pet.pt" ]; then + mtt train options-pet.yaml -o model-pet.pt +fi set +x # disable command echoing for sensitive private token check TOKEN_PRESENT=false From f7e9113fc385dd6ab5ac25d5976582681392694b Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Wed, 8 Oct 2025 11:25:30 +0200 Subject: [PATCH 04/10] chore(ci): always regenerate data for CI --- .github/workflows/architecture-tests.yml | 2 ++ .github/workflows/build.yml | 2 ++ .github/workflows/docs.yml | 2 ++ .github/workflows/tests.yml | 2 ++ 4 files changed, 8 insertions(+) diff --git a/.github/workflows/architecture-tests.yml b/.github/workflows/architecture-tests.yml index 4f412c5f49..e0bc06269a 100644 --- a/.github/workflows/architecture-tests.yml +++ b/.github/workflows/architecture-tests.yml @@ -33,5 +33,7 @@ jobs: - name: run architecture tests run: tox -e ${{ matrix.architecture-name }}-tests env: + # CI should always generate test files + FORCE_REGENERATE: true # Use the CPU only version of torch when building/running the code PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 588a85d823..e9574344de 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,5 +20,7 @@ jobs: - name: Test build integrity run: tox -e build env: + # CI should always generate test files + FORCE_REGENERATE: true # Use the CPU only version of torch when building/running the code PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index ec47d93599..baf8eb8ad5 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -23,6 +23,8 @@ jobs: - name: build documentation run: tox -e docs env: + # CI should always generate test files + FORCE_REGENERATE: true # Use the CPU-only version of torch PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 
dd8d14339b..2d7f2b531a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -44,6 +44,8 @@ jobs: tox -e tests coverage xml --data-file tests/.coverage env: + # CI should always generate test files + FORCE_REGENERATE: true # Use the CPU only version of torch when building/running the code PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu HUGGINGFACE_TOKEN_METATRAIN: ${{ secrets.HUGGINGFACE_TOKEN }} From 926e6f16f7b127ed208c0efcd9209ce15835b328 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Wed, 8 Oct 2025 11:27:57 +0200 Subject: [PATCH 05/10] enh(cache): now check the githash for changes --- .gitignore | 3 +++ tests/resources/generate-outputs.sh | 20 +++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ff5d46f744..dce1093ada 100644 --- a/.gitignore +++ b/.gitignore @@ -179,3 +179,6 @@ docs/src/examples node_modules/ package-lock.json package.json + +# caching githash +.data_version.txt diff --git a/tests/resources/generate-outputs.sh b/tests/resources/generate-outputs.sh index f7da9d0315..6f82a113f1 100755 --- a/tests/resources/generate-outputs.sh +++ b/tests/resources/generate-outputs.sh @@ -7,10 +7,23 @@ ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) cd "$ROOT_DIR" +HASH_FILE=".data_version.txt" FORCE_REGENERATE=false if [[ "${FORCE_REGENERATE:-0}" == "1" ]]; then - echo "FORCE_REGENERATE=1 detected. Regenerating all models." + echo "FORCE_REGENERATE=1 detected. Forcing regeneration of all models." FORCE_REGENERATE=true +else + if [ -f "$HASH_FILE" ]; then + SAVED_HASH=$(cat "$HASH_FILE") + CURRENT_HASH=$(git rev-parse HEAD) + if [ "$SAVED_HASH" != "$CURRENT_HASH" ]; then + echo "Git commit has changed. Forcing regeneration of all models." + FORCE_REGENERATE=true + fi + else + echo "Hash file not found. Forcing regeneration of all models." 
+ FORCE_REGENERATE=true + fi fi # Regenerate if --force is used OR if the file doesn't exist @@ -26,6 +39,11 @@ if [ "$FORCE_REGENERATE" = true ] || [ ! -f "model-pet.pt" ]; then mtt train options-pet.yaml -o model-pet.pt fi +if [ "$FORCE_REGENERATE" = true ]; then + echo "Saving current git commit hash to version the data." + git rev-parse HEAD > "$HASH_FILE" +fi + set +x # disable command echoing for sensitive private token check TOKEN_PRESENT=false if [[ -n "${HUGGINGFACE_TOKEN_METATRAIN:-}" ]]; then From d7ff8077af94ceea944db371f2f5b69a864eb036 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Wed, 8 Oct 2025 12:45:51 +0200 Subject: [PATCH 06/10] enh(cache): have a hashlist cache --- tests/resources/generate-outputs.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/resources/generate-outputs.sh b/tests/resources/generate-outputs.sh index 6f82a113f1..e2c286cd39 100755 --- a/tests/resources/generate-outputs.sh +++ b/tests/resources/generate-outputs.sh @@ -8,10 +8,15 @@ ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) cd "$ROOT_DIR" HASH_FILE=".data_version.txt" +# Things +WATCH_PATHS="src/" FORCE_REGENERATE=false if [[ "${FORCE_REGENERATE:-0}" == "1" ]]; then echo "FORCE_REGENERATE=1 detected. Forcing regeneration of all models." FORCE_REGENERATE=true +elif [ -n "$(git status --porcelain -- $WATCH_PATHS)" ]; then + echo "Uncommitted git changes detected in critical files. Regenerating." 
+ FORCE_REGENERATE=true else if [ -f "$HASH_FILE" ]; then SAVED_HASH=$(cat "$HASH_FILE") From 1185f08c6638a6c32905c7a14e783607fe73846d Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Wed, 8 Oct 2025 13:38:26 +0200 Subject: [PATCH 07/10] enh(cache): fancier with associative arrays --- tests/resources/generate-outputs.sh | 40 +++++++++++++++++++---------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/tests/resources/generate-outputs.sh b/tests/resources/generate-outputs.sh index e2c286cd39..06c305e51d 100755 --- a/tests/resources/generate-outputs.sh +++ b/tests/resources/generate-outputs.sh @@ -3,6 +3,15 @@ set -eux echo "Generating data for testing..." +# Define all model parameters in an associative array (like a dictionary) +# Key: The output filename +# Value: A semicolon-separated string of "config_file;extra_arg1;extra_arg2;..." +declare -A models=( + ["model-32-bit.pt"]="options.yaml;-r;base_precision=32" + ["model-64-bit.pt"]="options.yaml;-r;base_precision=64" + ["model-pet.pt"]="options-pet.yaml" +) + ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) cd "$ROOT_DIR" @@ -32,22 +41,27 @@ else fi # Regenerate if --force is used OR if the file doesn't exist -if [ "$FORCE_REGENERATE" = true ] || [ ! -f "model-32-bit.pt" ]; then - mtt train options.yaml -o model-32-bit.pt -r base_precision=32 -fi +for model_file in "${!models[@]}"; do + # Regenerate if --force is used OR if the file doesn't exist + if [ "$FORCE_REGENERATE" = true ] || [ ! -f "$model_file" ]; then + echo "Generating '$model_file'..." -if [ "$FORCE_REGENERATE" = true ] || [ ! -f "model-64-bit.pt" ]; then - mtt train options.yaml -o model-64-bit.pt -r base_precision=64 -fi + # Read the parameter string for the current model + params_str=${models["$model_file"]} -if [ "$FORCE_REGENERATE" = true ] || [ ! 
-f "model-pet.pt" ]; then - mtt train options-pet.yaml -o model-pet.pt -fi + # Safely split the string into a temporary array using ';' as the delimiter + IFS=';' read -r -a params_array <<< "$params_str" -if [ "$FORCE_REGENERATE" = true ]; then - echo "Saving current git commit hash to version the data." - git rev-parse HEAD > "$HASH_FILE" -fi + # The first element is the config file + config_file=${params_array[0]} + + # The rest of the elements are extra arguments for the command + extra_args=("${params_array[@]:1}") + + # Execute the command, safely passing the arguments + mtt train "$config_file" -o "$model_file" "${extra_args[@]}" + fi +done set +x # disable command echoing for sensitive private token check TOKEN_PRESENT=false From 3ec9dd68b592a25944a389b608a48de20d80cc4c Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Wed, 8 Oct 2025 13:40:23 +0200 Subject: [PATCH 08/10] Revert "enh(cache): fancier with associative arrays" This reverts commit 527daae7f64c6c76a658251dd5656d2ed7bd213c. --- tests/resources/generate-outputs.sh | 40 ++++++++++------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/tests/resources/generate-outputs.sh b/tests/resources/generate-outputs.sh index 06c305e51d..e2c286cd39 100755 --- a/tests/resources/generate-outputs.sh +++ b/tests/resources/generate-outputs.sh @@ -3,15 +3,6 @@ set -eux echo "Generating data for testing..." -# Define all model parameters in an associative array (like a dictionary) -# Key: The output filename -# Value: A semicolon-separated string of "config_file;extra_arg1;extra_arg2;..." 
-declare -A models=( - ["model-32-bit.pt"]="options.yaml;-r;base_precision=32" - ["model-64-bit.pt"]="options.yaml;-r;base_precision=64" - ["model-pet.pt"]="options-pet.yaml" -) - ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) cd "$ROOT_DIR" @@ -41,27 +32,22 @@ else fi # Regenerate if --force is used OR if the file doesn't exist -for model_file in "${!models[@]}"; do - # Regenerate if --force is used OR if the file doesn't exist - if [ "$FORCE_REGENERATE" = true ] || [ ! -f "$model_file" ]; then - echo "Generating '$model_file'..." - - # Read the parameter string for the current model - params_str=${models["$model_file"]} - - # Safely split the string into a temporary array using ';' as the delimiter - IFS=';' read -r -a params_array <<< "$params_str" +if [ "$FORCE_REGENERATE" = true ] || [ ! -f "model-32-bit.pt" ]; then + mtt train options.yaml -o model-32-bit.pt -r base_precision=32 +fi - # The first element is the config file - config_file=${params_array[0]} +if [ "$FORCE_REGENERATE" = true ] || [ ! -f "model-64-bit.pt" ]; then + mtt train options.yaml -o model-64-bit.pt -r base_precision=64 +fi - # The rest of the elements are extra arguments for the command - extra_args=("${params_array[@]:1}") +if [ "$FORCE_REGENERATE" = true ] || [ ! -f "model-pet.pt" ]; then + mtt train options-pet.yaml -o model-pet.pt +fi - # Execute the command, safely passing the arguments - mtt train "$config_file" -o "$model_file" "${extra_args[@]}" - fi -done +if [ "$FORCE_REGENERATE" = true ]; then + echo "Saving current git commit hash to version the data." 
+ git rev-parse HEAD > "$HASH_FILE" +fi set +x # disable command echoing for sensitive private token check TOKEN_PRESENT=false From bcc792ed497d45f9f8c8659e4e6b1f9bdc2f16d3 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Wed, 8 Oct 2025 17:02:00 +0200 Subject: [PATCH 09/10] chore(cache): opt-in instead of opt-out --- tests/resources/generate-outputs.sh | 45 ++++++++++++++++------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/tests/resources/generate-outputs.sh b/tests/resources/generate-outputs.sh index e2c286cd39..bed498f338 100755 --- a/tests/resources/generate-outputs.sh +++ b/tests/resources/generate-outputs.sh @@ -8,30 +8,35 @@ ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) cd "$ROOT_DIR" HASH_FILE=".data_version.txt" -# Things WATCH_PATHS="src/" -FORCE_REGENERATE=false -if [[ "${FORCE_REGENERATE:-0}" == "1" ]]; then - echo "FORCE_REGENERATE=1 detected. Forcing regeneration of all models." - FORCE_REGENERATE=true -elif [ -n "$(git status --porcelain -- $WATCH_PATHS)" ]; then - echo "Uncommitted git changes detected in critical files. Regenerating." - FORCE_REGENERATE=true -else - if [ -f "$HASH_FILE" ]; then - SAVED_HASH=$(cat "$HASH_FILE") - CURRENT_HASH=$(git rev-parse HEAD) - if [ "$SAVED_HASH" != "$CURRENT_HASH" ]; then - echo "Git commit has changed. Forcing regeneration of all models." - FORCE_REGENERATE=true +FORCE_REGENERATE=true + +if [[ "${USE_CACHE:-0}" == "1" ]]; then + echo "USE_CACHE=1 detected. Attempting to use cached data." + CACHE_IS_VALID=true + if [ -n "$(git -C "$(git rev-parse --show-toplevel)" status --porcelain -- $WATCH_PATHS)" ]; then # pathspecs resolve against cwd ($ROOT_DIR, the script's directory); WATCH_PATHS is assumed repo-root-relative, so query git from the top level + echo "Cache is invalid due to uncommitted changes. Must regenerate." + CACHE_IS_VALID=false + elif [ ! -f "$HASH_FILE" ]; then + echo "Cache is invalid: version file not found. Must regenerate." + CACHE_IS_VALID=false + else + SAVED_HASH=$(cat "$HASH_FILE") + CURRENT_HASH=$(git rev-parse HEAD) + if [ "$SAVED_HASH" != "$CURRENT_HASH" ]; then + echo "Cache is invalid: code version has changed. 
Must regenerate." + CACHE_IS_VALID=false + fi + fi + + # If all checks passed, we can rely on the cache. + if [ "$CACHE_IS_VALID" = true ]; then + echo "Cache is valid. Will skip regeneration for existing files." + FORCE_REGENERATE=false fi - else - echo "Hash file not found. Forcing regeneration of all models." - FORCE_REGENERATE=true - fi fi -# Regenerate if --force is used OR if the file doesn't exist +# Regenerate if regeneration is forced (default) OR if a file is missing. if [ "$FORCE_REGENERATE" = true ] || [ ! -f "model-32-bit.pt" ]; then mtt train options.yaml -o model-32-bit.pt -r base_precision=32 fi From 4bbae30459ca4b367f5b42a9dc49a48cee491bd9 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Wed, 8 Oct 2025 17:02:55 +0200 Subject: [PATCH 10/10] doc(cache): update with opt-in --- CONTRIBUTING.rst | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 394f835a5d..774f1b8e02 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -89,13 +89,16 @@ testing it. Also, you may want to setup your editor to automatically apply the ` are plugins to do this with `all major editors `_. -The main test suite relies on pre-generated model files that are cached for performance. -If you need to force a regeneration of these files, you can set the ``FORCE_REGENERATE`` -environment variable to ``1`` before running the tests: +By default, the main test suite regenerates the necessary model files every time +it runs. For faster local development, you can **opt-in** to caching these files +by setting the ``USE_CACHE`` environment variable to ``1``: .. code-block:: bash - FORCE_REGENERATE=1 tox -e tests + USE_CACHE=1 tox -e tests + +When caching is enabled, the script will skip regeneration as long as the cached +files exist and the underlying source code has not changed. If you want to test a specific archicture you can also do it. For example