From 38f3d1b25c37df4eef104308ffa656998475d2c0 Mon Sep 17 00:00:00 2001 From: Tien Tong <35613222+tien-tong@users.noreply.github.com> Date: Mon, 23 Mar 2026 11:46:44 -0400 Subject: [PATCH 1/6] fix sparse checkout for zipped input --- .../templates/determine_zipfilename.sh.jinja2 | 9 ++++-- babs/templates/participant_job.sh.jinja2 | 31 ++++++++++++++----- tests/test_generate_submit_script.py | 25 +++++++++++++++ 3 files changed, 54 insertions(+), 11 deletions(-) diff --git a/babs/templates/determine_zipfilename.sh.jinja2 b/babs/templates/determine_zipfilename.sh.jinja2 index 68214405..dabfcf7d 100644 --- a/babs/templates/determine_zipfilename.sh.jinja2 +++ b/babs/templates/determine_zipfilename.sh.jinja2 @@ -20,14 +20,17 @@ find_single_zip_in_git_tree() {{ '{' }} {% endraw %} fi - printf "%s/%s\n" "${zip_search_path}" "${hits}" + # Repo-relative path only; caller joins with path_in_babs for absolute path from job clone. + printf '%s\n' "${hits}" {{ '}' }} {% for input_dataset in input_datasets %} {% if input_dataset['is_zipped'] %} -{{ input_dataset['name'].upper() }}_ZIP="$(find_single_zip_in_git_tree {{ input_dataset['path_in_babs'] }} {{ input_dataset['name'] }})" +{% set u = input_dataset['name'] | upper %} +{{ u }}_ZIP_REL="$(find_single_zip_in_git_tree {{ input_dataset['path_in_babs'] }} {{ input_dataset['name'] }})" +{{ u }}_ZIP="{{ input_dataset['path_in_babs'] }}/${%raw%}${{{%endraw%}{{ u }}_ZIP_REL{%raw%}}{%endraw%}" echo 'found {{ input_dataset['name'] }} zipfile:' -echo "${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}" +echo "${%raw%}{{%endraw%}{{ u }}_ZIP{%raw%}}{%endraw%}" {% endif %} {% endfor %} {% endif %} diff --git a/babs/templates/participant_job.sh.jinja2 b/babs/templates/participant_job.sh.jinja2 index c6051411..43b3fa3a 100644 --- a/babs/templates/participant_job.sh.jinja2 +++ b/babs/templates/participant_job.sh.jinja2 @@ -81,6 +81,18 @@ git checkout -f # Start of the application-specific code: ------------------------------ +# Limit a nested input subdataset checkout to specific paths (relative to that repo root). +# Used for BIDS trees (subject[/session] + dataset_description) and for zipped inputs +# (single repo-relative zip path in ``*_ZIP_REL``, set by zip_locator_text below). +sparse_subdataset_include_only() { + local repo="$1" + shift + [ -n "${repo}" ] && [ -d "${repo}/.git" ] || return 0 + ( cd "${repo}" && \ + git sparse-checkout init --no-cone 2>/dev/null && \ + printf '%s\n' "$@" | git sparse-checkout set --stdin 2>/dev/null ) || true +} + # pull down only needed session path and explicit dataset-level metadata: echo "# Pull down the input session but don't retrieve data contents:" {% for input_dataset in input_datasets %} @@ -93,19 +105,22 @@ datalad get -n "{{ input_dataset['path_in_babs'] }}" {% endif %} {% endfor %} -# Restrict each BIDS input subdataset to the current subject so BIDS apps that index the full dataset (e.g. pybids BIDSLayout) don't try to read other subjects' files, which may not be retrieved +{{ zip_locator_text }} + +# Restrict each input subdataset worktree: BIDS → this subject[/session] only (so e.g. pybids +# does not scan other subjects); zipped → only the one zip file for this job (see *_ZIP_REL). {% for input_dataset in input_datasets %} {% if not input_dataset['is_zipped'] %} -if [ -d "{{ input_dataset['path_in_babs'] }}/.git" ]; then - ( cd "{{ input_dataset['path_in_babs'] }}" && \ - ( git sparse-checkout init --no-cone 2>/dev/null && \ - { echo "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}"; echo 'dataset_description.json'; } | git sparse-checkout set --stdin 2>/dev/null ) ) || true -fi +sparse_subdataset_include_only "{{ input_dataset['path_in_babs'] }}" \ + "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" \ + "dataset_description.json" +{% else %} +{% set u = input_dataset['name'] | upper %} +sparse_subdataset_include_only "{{ input_dataset['path_in_babs'] }}" \ + "${%raw%}{{%endraw%}{{ u }}_ZIP_REL{%raw%}}{%endraw%}" {% endif %} {% endfor %} -{{ zip_locator_text }} - # Link to shared container image so each job does not re-clone the same image. # If shared path is not available (e.g. Slurm Docker workers), retrieve image in this clone. CONTAINER_SHARED="${PROJECT_ROOT}/analysis/containers/.datalad/environments/{{ container_name }}/image" diff --git a/tests/test_generate_submit_script.py b/tests/test_generate_submit_script.py index 6d0868a3..c488c846 100644 --- a/tests/test_generate_submit_script.py +++ b/tests/test_generate_submit_script.py @@ -141,6 +141,31 @@ def run_shellcheck(script_path): return False, str(e) +def test_zipped_input_subdataset_sparse_checkout(): + """Zipped input subdatasets get nested sparse-checkout to the resolved zip only. + + ``find_single_zip_in_git_tree`` returns a repo-relative path (``*_ZIP_REL``); the full + path ``*_ZIP`` is composed in shell. ``sparse_subdataset_include_only`` is shared with + non-zipped inputs and receives the same relative paths git expects for sparse-checkout. + """ + config_path = NOTEBOOKS_DIR / 'eg_xcpd-0-10-6_linc.yaml' + config = read_yaml(config_path) + script_content = generate_submit_script( + queue_system='slurm', + cluster_resources_config=config['cluster_resources'], + script_preamble=config['script_preamble'], + job_scratch_directory=config['job_compute_space'], + input_datasets=input_datasets_xcpd, + processing_level='subject', + container_name=config_path.name.split('_')[1], + zip_foldernames=config['zip_foldernames'], + ) + assert 'sparse_subdataset_include_only()' in script_content + assert 'FMRIPREP_ZIP_REL="$(find_single_zip_in_git_tree' in script_content + assert 'FMRIPREP_ZIP="inputs/data/${FMRIPREP_ZIP_REL}"' in script_content + assert 'sparse_subdataset_include_only "inputs/data"' in script_content + + def test_generate_submit_script_pipeline(tmp_path): """Test submit script generation for pipeline configuration.""" # Use same pattern as single-app tests: read from existing YAML config From dd8a9e11794f7fbb415e568ac6a499d7881ba2a1 Mon Sep 17 00:00:00 2001 From: Tien Tong <35613222+tien-tong@users.noreply.github.com> Date: Mon, 23 Mar 2026 12:16:55 -0400 Subject: [PATCH 2/6] fix --- babs/templates/determine_zipfilename.sh.jinja2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/babs/templates/determine_zipfilename.sh.jinja2 b/babs/templates/determine_zipfilename.sh.jinja2 index dabfcf7d..1702f5ce 100644 --- a/babs/templates/determine_zipfilename.sh.jinja2 +++ b/babs/templates/determine_zipfilename.sh.jinja2 @@ -28,7 +28,7 @@ find_single_zip_in_git_tree() {{ '{' }} {% if input_dataset['is_zipped'] %} {% set u = input_dataset['name'] | upper %} {{ u }}_ZIP_REL="$(find_single_zip_in_git_tree {{ input_dataset['path_in_babs'] }} {{ input_dataset['name'] }})" -{{ u }}_ZIP="{{ input_dataset['path_in_babs'] }}/${%raw%}${{{%endraw%}{{ u }}_ZIP_REL{%raw%}}{%endraw%}" +{{ u }}_ZIP="{{ input_dataset['path_in_babs'] }}/{% raw %}${{% endraw %}{{ u }}_ZIP_REL{% raw %}}{% endraw %}" echo 'found {{ input_dataset['name'] }} zipfile:' echo "${%raw%}{{%endraw%}{{ u }}_ZIP{%raw%}}{%endraw%}" {% endif %} From 10ecdcaf2dcbd5aadf0f2f7baaef4ea60329dce9 Mon Sep 17 00:00:00 2001 From: Tien Tong <35613222+tien-tong@users.noreply.github.com> Date: Mon, 23 Mar 2026 13:52:22 -0400 Subject: [PATCH 3/6] fix --- babs/generate_submit_script.py | 7 +- .../templates/determine_zipfilename.sh.jinja2 | 18 ++-- babs/templates/participant_job.sh.jinja2 | 89 ++++++++++++------- tests/test_generate_submit_script.py | 25 ------ 4 files changed, 65 insertions(+), 74 deletions(-) diff --git a/babs/generate_submit_script.py b/babs/generate_submit_script.py index c586619d..baa7b88a 100644 --- a/babs/generate_submit_script.py +++ b/babs/generate_submit_script.py @@ -93,14 +93,9 @@ def generate_submit_script( varname_taskid = 'SLURM_ARRAY_TASK_ID' varname_jobid = 'SLURM_ARRAY_JOB_ID' - # If any input dataset is zipped, get the setup for the zipfile locator: - zip_locator_template = env.get_template('determine_zipfilename.sh.jinja2') - zip_locator_text = zip_locator_template.render( + zip_locator_text = env.get_template('determine_zipfilename.sh.jinja2').render( input_datasets=input_datasets, processing_level=processing_level, - has_a_zipped_input_dataset=any( - input_dataset['is_zipped'] for input_dataset in input_datasets - ), ) return participant_job_template.render( diff --git a/babs/templates/determine_zipfilename.sh.jinja2 b/babs/templates/determine_zipfilename.sh.jinja2 index 1702f5ce..98a48c6e 100644 --- a/babs/templates/determine_zipfilename.sh.jinja2 +++ b/babs/templates/determine_zipfilename.sh.jinja2 @@ -1,12 +1,17 @@ +{% if input_datasets | selectattr('is_zipped') | list %} # shellcheck disable=SC1091 -{% if has_a_zipped_input_dataset %} find_single_zip_in_git_tree() {{ '{' }} local zip_search_path="$1" local name="$2" local hits count + # Bounded listing: ``ls-files --with-tree=HEAD`` supports ``:(glob)`` (works with no-checkout clone); grep applies session filter. hits="$( - git -C "${zip_search_path}" ls-tree -r --name-only HEAD \ + git -C "${zip_search_path}" ls-files --with-tree=HEAD -- \ + "${subid}" \ + ":(glob)${subid}_*" \ + ":(glob)${subid}.*" \ + 2>/dev/null | sed '/^$/d' | sort -u \ | grep -E "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}{% raw %}.*${sesid}{% endraw %}{% endif %}{% raw %}.*${name}.*\.zip${% endraw %}" \ || true )" @@ -24,13 +29,4 @@ find_single_zip_in_git_tree() {{ '{' }} printf '%s\n' "${hits}" {{ '}' }} -{% for input_dataset in input_datasets %} -{% if input_dataset['is_zipped'] %} -{% set u = input_dataset['name'] | upper %} -{{ u }}_ZIP_REL="$(find_single_zip_in_git_tree {{ input_dataset['path_in_babs'] }} {{ input_dataset['name'] }})" -{{ u }}_ZIP="{{ input_dataset['path_in_babs'] }}/{% raw %}${{% endraw %}{{ u }}_ZIP_REL{% raw %}}{% endraw %}" -echo 'found {{ input_dataset['name'] }} zipfile:' -echo "${%raw%}{{%endraw%}{{ u }}_ZIP{%raw%}}{%endraw%}" -{% endif %} -{% endfor %} {% endif %} diff --git a/babs/templates/participant_job.sh.jinja2 b/babs/templates/participant_job.sh.jinja2 index 43b3fa3a..a5ed487c 100644 --- a/babs/templates/participant_job.sh.jinja2 +++ b/babs/templates/participant_job.sh.jinja2 @@ -71,56 +71,81 @@ fi git sparse-checkout set \ code \ - containers \ -{% for input_dataset in input_datasets %} - {{ input_dataset['path_in_babs'] }}{% if not loop.last %} \ -{% endif %} -{% endfor %} + containers git checkout -f # Start of the application-specific code: ------------------------------ -# Limit a nested input subdataset checkout to specific paths (relative to that repo root). -# Used for BIDS trees (subject[/session] + dataset_description) and for zipped inputs -# (single repo-relative zip path in ``*_ZIP_REL``, set by zip_locator_text below). -sparse_subdataset_include_only() { - local repo="$1" +{{ zip_locator_text }} + +# Input datasets: ``datalad clone --no-checkout``, sparse paths, ``git checkout -f``. +ensure_input_submodule() { + local mount="$1" + [ -n "${mount}" ] || return 0 + if git -C "${mount}" rev-parse --git-dir >/dev/null 2>&1; then + return 0 + fi + local src + src="$(git config -f .gitmodules --get "submodule.${mount}.datalad-url" 2>/dev/null)" \ + || src="$(git config -f .gitmodules --get "submodule.${mount}.url" 2>/dev/null)" \ + || src="" + if [ -n "${src}" ]; then + datalad clone -d . "${src}" "${mount}" -- --no-checkout + else + echo "ERROR: no submodule URL in .gitmodules for ${mount}" 1>&2 + exit 1 + fi +} + +sparse_checkout_input() { + local mount="$1" shift - [ -n "${repo}" ] && [ -d "${repo}/.git" ] || return 0 - ( cd "${repo}" && \ - git sparse-checkout init --no-cone 2>/dev/null && \ - printf '%s\n' "$@" | git sparse-checkout set --stdin 2>/dev/null ) || true + [ -n "${mount}" ] || return 0 + git -C "${mount}" rev-parse --git-dir >/dev/null 2>&1 || return 0 + ( cd "${mount}" && \ + git sparse-checkout init --no-cone && \ + printf '%s\n' "$@" | git sparse-checkout set --stdin && \ + git checkout -f ) } -# pull down only needed session path and explicit dataset-level metadata: -echo "# Pull down the input session but don't retrieve data contents:" +echo "# Install input datasets (sparse worktrees)" {% for input_dataset in input_datasets %} {% if not input_dataset['is_zipped'] %} -datalad get -n "{{ input_dataset['path_in_babs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" +ensure_input_submodule "{{ input_dataset['path_in_babs'] }}" +sparse_checkout_input "{{ input_dataset['path_in_babs'] }}" \ + "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" \ + "dataset_description.json" -datalad get -n "{{ input_dataset['path_in_babs'] }}/dataset_description.json" -{% else %} -datalad get -n "{{ input_dataset['path_in_babs'] }}" +datalad get -n \ + "{{ input_dataset['path_in_babs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" \ + "{{ input_dataset['path_in_babs'] }}/dataset_description.json" {% endif %} {% endfor %} -{{ zip_locator_text }} - -# Restrict each input subdataset worktree: BIDS → this subject[/session] only (so e.g. pybids -# does not scan other subjects); zipped → only the one zip file for this job (see *_ZIP_REL). -{% for input_dataset in input_datasets %} -{% if not input_dataset['is_zipped'] %} -sparse_subdataset_include_only "{{ input_dataset['path_in_babs'] }}" \ - "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" \ - "dataset_description.json" -{% else %} +{% if input_datasets | selectattr('is_zipped') | list %} +{% for mount, group in (input_datasets | selectattr('is_zipped') | list | groupby('path_in_babs')) %} +ensure_input_submodule "{{ mount }}" +{% for input_dataset in group %} {% set u = input_dataset['name'] | upper %} -sparse_subdataset_include_only "{{ input_dataset['path_in_babs'] }}" \ - "${%raw%}{{%endraw%}{{ u }}_ZIP_REL{%raw%}}{%endraw%}" +{{ u }}_ZIP_REL="$(find_single_zip_in_git_tree {{ mount }} {{ input_dataset['name'] }})" +{{ u }}_ZIP="{{ mount }}/{% raw %}${{% endraw %}{{ u }}_ZIP_REL{% raw %}}{% endraw %}" +echo 'found {{ input_dataset['name'] }} zipfile:' +echo "${%raw%}{{%endraw%}{{ u }}_ZIP{%raw%}}{%endraw%}" +{% endfor %} +sparse_checkout_input "{{ mount }}" \ +{% for input_dataset in group %} + "${%raw%}{{%endraw%}{{ input_dataset['name'] | upper }}_ZIP_REL{%raw%}}{%endraw%}"{% if not loop.last %} \ {% endif %} {% endfor %} +{% for input_dataset in group %} +{% set u = input_dataset['name'] | upper %} +datalad get -n "${%raw%}{{%endraw%}{{ u }}_ZIP{%raw%}}{%endraw%}" +{% endfor %} +{% endfor %} +{% endif %} + # Link to shared container image so each job does not re-clone the same image. # If shared path is not available (e.g. Slurm Docker workers), retrieve image in this clone. CONTAINER_SHARED="${PROJECT_ROOT}/analysis/containers/.datalad/environments/{{ container_name }}/image" diff --git a/tests/test_generate_submit_script.py b/tests/test_generate_submit_script.py index c488c846..6d0868a3 100644 --- a/tests/test_generate_submit_script.py +++ b/tests/test_generate_submit_script.py @@ -141,31 +141,6 @@ def run_shellcheck(script_path): return False, str(e) -def test_zipped_input_subdataset_sparse_checkout(): - """Zipped input subdatasets get nested sparse-checkout to the resolved zip only. - - ``find_single_zip_in_git_tree`` returns a repo-relative path (``*_ZIP_REL``); the full - path ``*_ZIP`` is composed in shell. ``sparse_subdataset_include_only`` is shared with - non-zipped inputs and receives the same relative paths git expects for sparse-checkout. - """ - config_path = NOTEBOOKS_DIR / 'eg_xcpd-0-10-6_linc.yaml' - config = read_yaml(config_path) - script_content = generate_submit_script( - queue_system='slurm', - cluster_resources_config=config['cluster_resources'], - script_preamble=config['script_preamble'], - job_scratch_directory=config['job_compute_space'], - input_datasets=input_datasets_xcpd, - processing_level='subject', - container_name=config_path.name.split('_')[1], - zip_foldernames=config['zip_foldernames'], - ) - assert 'sparse_subdataset_include_only()' in script_content - assert 'FMRIPREP_ZIP_REL="$(find_single_zip_in_git_tree' in script_content - assert 'FMRIPREP_ZIP="inputs/data/${FMRIPREP_ZIP_REL}"' in script_content - assert 'sparse_subdataset_include_only "inputs/data"' in script_content - - def test_generate_submit_script_pipeline(tmp_path): """Test submit script generation for pipeline configuration.""" # Use same pattern as single-app tests: read from existing YAML config From a9597be7e3369156a5bdd454b4de1400ef645ba5 Mon Sep 17 00:00:00 2001 From: Tien Tong <35613222+tien-tong@users.noreply.github.com> Date: Mon, 23 Mar 2026 15:28:03 -0400 Subject: [PATCH 4/6] fix --- babs/templates/participant_job.sh.jinja2 | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/babs/templates/participant_job.sh.jinja2 b/babs/templates/participant_job.sh.jinja2 index a5ed487c..ae796c45 100644 --- a/babs/templates/participant_job.sh.jinja2 +++ b/babs/templates/participant_job.sh.jinja2 @@ -80,6 +80,11 @@ git checkout -f {{ zip_locator_text }} # Input datasets: ``datalad clone --no-checkout``, sparse paths, ``git checkout -f``. +# +# ensure_input_submodule: if $1 is nonempty and not already a Git repo, clone it as a +# DataLad subdataset using .gitmodules (prefer datalad-url, else url). Plain absolute +# paths are converted to file:// URLs. Uses --no-checkout. No-op if already present; +# exits if no URL is configured. ensure_input_submodule() { local mount="$1" [ -n "${mount}" ] || return 0 @@ -91,6 +96,9 @@ ensure_input_submodule() { || src="$(git config -f .gitmodules --get "submodule.${mount}.url" 2>/dev/null)" \ || src="" if [ -n "${src}" ]; then + case "${src}" in + /*) src="file://${src}" ;; + esac datalad clone -d . "${src}" "${mount}" -- --no-checkout else echo "ERROR: no submodule URL in .gitmodules for ${mount}" 1>&2 @@ -98,6 +106,10 @@ ensure_input_submodule() { fi } +# sparse_checkout_input: if $1 is nonempty and points to a Git repo, enter that repo, +# initialize sparse-checkout in non-cone mode, set the sparse-checkout patterns from the +# remaining arguments (passed one per line to `git sparse-checkout set --stdin`), and then +# force a checkout with `git checkout -f`. No-op if $1 is empty or not a Git repo. sparse_checkout_input() { local mount="$1" shift From 0d48219f26374d68b335e0b788f98cd002696545 Mon Sep 17 00:00:00 2001 From: Tien Tong <35613222+tien-tong@users.noreply.github.com> Date: Mon, 23 Mar 2026 16:52:10 -0400 Subject: [PATCH 5/6] fix --- .../templates/determine_zipfilename.sh.jinja2 | 10 ++++++-- babs/templates/participant_job.sh.jinja2 | 25 +++++++++++-------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/babs/templates/determine_zipfilename.sh.jinja2 b/babs/templates/determine_zipfilename.sh.jinja2 index 98a48c6e..2edf8199 100644 --- a/babs/templates/determine_zipfilename.sh.jinja2 +++ b/babs/templates/determine_zipfilename.sh.jinja2 @@ -5,14 +5,20 @@ find_single_zip_in_git_tree() {{ '{' }} local name="$2" local hits count - # Bounded listing: ``ls-files --with-tree=HEAD`` supports ``:(glob)`` (works with no-checkout clone); grep applies session filter. + # Bounded listing: ``ls-files --with-tree=HEAD`` supports ``:(glob)`` (works with no-checkout clone). + # Session jobs: require ``_${sesid}_`` in the filename. + # Subject jobs: do not use ``${sesid}`` in the pattern. hits="$( git -C "${zip_search_path}" ls-files --with-tree=HEAD -- \ "${subid}" \ ":(glob)${subid}_*" \ ":(glob)${subid}.*" \ 2>/dev/null | sed '/^$/d' | sort -u \ - | grep -E "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}{% raw %}.*${sesid}{% endraw %}{% endif %}{% raw %}.*${name}.*\.zip${% endraw %}" \ +{% if processing_level == 'session' %} + | grep -E "{% raw %}^${subid}_${sesid}_${name}.*\.zip${% endraw %}" \ +{% else %} + | grep -E "{% raw %}^${subid}_.*${name}.*\.zip${% endraw %}" \ +{% endif %} || true )" diff --git a/babs/templates/participant_job.sh.jinja2 b/babs/templates/participant_job.sh.jinja2 index ae796c45..ccc8e84b 100644 --- a/babs/templates/participant_job.sh.jinja2 +++ b/babs/templates/participant_job.sh.jinja2 @@ -69,9 +69,15 @@ if ! git sparse-checkout init --cone; then exit 1 fi +{% set _sparse_input_paths = input_datasets | map(attribute='path_in_babs') | unique | list %} git sparse-checkout set \ code \ - containers + containers{% if _sparse_input_paths %} \ +{% for p in _sparse_input_paths %} + {{ p }}{% if not loop.last %} \ +{% endif %} +{% endfor %} +{% endif %} git checkout -f @@ -106,19 +112,19 @@ ensure_input_submodule() { fi } -# sparse_checkout_input: if $1 is nonempty and points to a Git repo, enter that repo, -# initialize sparse-checkout in non-cone mode, set the sparse-checkout patterns from the -# remaining arguments (passed one per line to `git sparse-checkout set --stdin`), and then -# force a checkout with `git checkout -f`. No-op if $1 is empty or not a Git repo. +# sparse_checkout_input: if $1 is nonempty and points to a Git repo (e.g. a submodule path +# from .gitmodules such as inputs/data/BIDS), initialize sparse-checkout in non-cone mode, +# set patterns from remaining args (one path per line via --stdin), and checkout -f. +# Uses git -C so cwd stays the job superdataset. No-op if $1 is empty or not a Git repo. sparse_checkout_input() { local mount="$1" shift [ -n "${mount}" ] || return 0 git -C "${mount}" rev-parse --git-dir >/dev/null 2>&1 || return 0 - ( cd "${mount}" && \ - git sparse-checkout init --no-cone && \ - printf '%s\n' "$@" | git sparse-checkout set --stdin && \ - git checkout -f ) + + git -C "${mount}" sparse-checkout init --no-cone + printf '%s\n' "$@" | git -C "${mount}" sparse-checkout set --stdin + git -C "${mount}" checkout -f } echo "# Install input datasets (sparse worktrees)" @@ -159,7 +165,6 @@ datalad get -n "${%raw%}{{%endraw%}{{ u }}_ZIP{%raw%}}{%endraw%}" {% endif %} # Link to shared container image so each job does not re-clone the same image. -# If shared path is not available (e.g. Slurm Docker workers), retrieve image in this clone. CONTAINER_SHARED="${PROJECT_ROOT}/analysis/containers/.datalad/environments/{{ container_name }}/image" CONTAINER_JOB="containers/.datalad/environments/{{ container_name }}/image" From 9677d609f3fa37af2d1816bc60a76753f36a097f Mon Sep 17 00:00:00 2001 From: Tien Tong <35613222+tien-tong@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:52:09 -0400 Subject: [PATCH 6/6] test --- babs/templates/participant_job.sh.jinja2 | 40 +++++++++++++++++------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/babs/templates/participant_job.sh.jinja2 b/babs/templates/participant_job.sh.jinja2 index ccc8e84b..e3f59239 100644 --- a/babs/templates/participant_job.sh.jinja2 +++ b/babs/templates/participant_job.sh.jinja2 @@ -52,7 +52,7 @@ cd "${BRANCH}" # datalad clone the input ria: echo '# Clone the data from input RIA:' -datalad clone "${dssource}" ds -- --no-checkout +datalad clone "${dssource}" ds -- --no-checkout --shared cd ds # set up the result deposition: @@ -87,16 +87,32 @@ git checkout -f # Input datasets: ``datalad clone --no-checkout``, sparse paths, ``git checkout -f``. # -# ensure_input_submodule: if $1 is nonempty and not already a Git repo, clone it as a -# DataLad subdataset using .gitmodules (prefer datalad-url, else url). Plain absolute -# paths are converted to file:// URLs. Uses --no-checkout. No-op if already present; +# is_installed_repo_root: true only when $1 is the root of an installed Git repo +is_installed_repo_root() { + local mount="$1" + local mount_abs top + + [ -n "${mount}" ] || return 1 + [ -d "${mount}" ] || return 1 + + mount_abs="$(cd "${mount}" 2>/dev/null && pwd -P)" || return 1 + top="$(git -C "${mount}" rev-parse --show-toplevel 2>/dev/null)" || return 1 + + [ "${top}" = "${mount_abs}" ] +} + +# ensure_input_submodule: if $1 is nonempty and not already an installed Git repo root, +# clone it as a DataLad subdataset using .gitmodules (prefer datalad-url, else url). Plain +# absolute paths are converted to file:// URLs. Uses --no-checkout. No-op if already present; # exits if no URL is configured. ensure_input_submodule() { local mount="$1" [ -n "${mount}" ] || return 0 - if git -C "${mount}" rev-parse --git-dir >/dev/null 2>&1; then + + if is_installed_repo_root "${mount}"; then return 0 fi + local src src="$(git config -f .gitmodules --get "submodule.${mount}.datalad-url" 2>/dev/null)" \ || src="$(git config -f .gitmodules --get "submodule.${mount}.url" 2>/dev/null)" \ @@ -105,22 +121,24 @@ ensure_input_submodule() { case "${src}" in /*) src="file://${src}" ;; esac - datalad clone -d . "${src}" "${mount}" -- --no-checkout + datalad clone --reckless ephemeral -d . "${src}" "${mount}" -- --no-checkout --shared else echo "ERROR: no submodule URL in .gitmodules for ${mount}" 1>&2 exit 1 fi } -# sparse_checkout_input: if $1 is nonempty and points to a Git repo (e.g. a submodule path -# from .gitmodules such as inputs/data/BIDS), initialize sparse-checkout in non-cone mode, -# set patterns from remaining args (one path per line via --stdin), and checkout -f. -# Uses git -C so cwd stays the job superdataset. No-op if $1 is empty or not a Git repo. +# sparse_checkout_input: if $1 is nonempty and is the root of an installed Git repo +# (for example, a submodule path from .gitmodules such as inputs/data/BIDS), initialize +# sparse-checkout in non-cone mode, set patterns from remaining args (one path per line +# via --stdin), and checkout -f. Uses git -C to not change working directory. +# No-op if $1 is empty or not an installed +# Git repo root. sparse_checkout_input() { local mount="$1" shift [ -n "${mount}" ] || return 0 - git -C "${mount}" rev-parse --git-dir >/dev/null 2>&1 || return 0 + is_installed_repo_root "${mount}" || return 0 git -C "${mount}" sparse-checkout init --no-cone printf '%s\n' "$@" | git -C "${mount}" sparse-checkout set --stdin