diff --git a/babs/generate_submit_script.py b/babs/generate_submit_script.py index c586619d..baa7b88a 100644 --- a/babs/generate_submit_script.py +++ b/babs/generate_submit_script.py @@ -93,14 +93,9 @@ def generate_submit_script( varname_taskid = 'SLURM_ARRAY_TASK_ID' varname_jobid = 'SLURM_ARRAY_JOB_ID' - # If any input dataset is zipped, get the setup for the zipfile locator: - zip_locator_template = env.get_template('determine_zipfilename.sh.jinja2') - zip_locator_text = zip_locator_template.render( + zip_locator_text = env.get_template('determine_zipfilename.sh.jinja2').render( input_datasets=input_datasets, processing_level=processing_level, - has_a_zipped_input_dataset=any( - input_dataset['is_zipped'] for input_dataset in input_datasets - ), ) return participant_job_template.render( diff --git a/babs/templates/determine_zipfilename.sh.jinja2 b/babs/templates/determine_zipfilename.sh.jinja2 index 68214405..2edf8199 100644 --- a/babs/templates/determine_zipfilename.sh.jinja2 +++ b/babs/templates/determine_zipfilename.sh.jinja2 @@ -1,13 +1,24 @@ +{% if input_datasets | selectattr('is_zipped') | list %} # shellcheck disable=SC1091 -{% if has_a_zipped_input_dataset %} find_single_zip_in_git_tree() {{ '{' }} local zip_search_path="$1" local name="$2" local hits count + # Bounded listing: ``ls-files --with-tree=HEAD`` supports ``:(glob)`` (works with no-checkout clone). + # Session jobs: require ``_${sesid}_`` in the filename. + # Subject jobs: do not use ``${sesid}`` in the pattern. hits="$( - git -C "${zip_search_path}" ls-tree -r --name-only HEAD \ - | grep -E "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}{% raw %}.*${sesid}{% endraw %}{% endif %}{% raw %}.*${name}.*\.zip${% endraw %}" \ + git -C "${zip_search_path}" ls-files --with-tree=HEAD -- \ + "${subid}" \ + ":(glob)${subid}_*" \ + ":(glob)${subid}.*" \ + 2>/dev/null | sed '/^$/d' | sort -u \ +{% if processing_level == 'session' %} + | grep -E "{% raw %}^${subid}_${sesid}_${name}.*\.zip${% endraw %}" \ +{% else %} + | grep -E "{% raw %}^${subid}_.*${name}.*\.zip${% endraw %}" \ +{% endif %} || true )" @@ -20,14 +31,8 @@ find_single_zip_in_git_tree() {{ '{' }} {% endraw %} fi - printf "%s/%s\n" "${zip_search_path}" "${hits}" + # Repo-relative path only; caller joins with path_in_babs for absolute path from job clone. + printf '%s\n' "${hits}" {{ '}' }} -{% for input_dataset in input_datasets %} -{% if input_dataset['is_zipped'] %} -{{ input_dataset['name'].upper() }}_ZIP="$(find_single_zip_in_git_tree {{ input_dataset['path_in_babs'] }} {{ input_dataset['name'] }})" -echo 'found {{ input_dataset['name'] }} zipfile:' -echo "${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}" -{% endif %} -{% endfor %} {% endif %} diff --git a/babs/templates/participant_job.sh.jinja2 b/babs/templates/participant_job.sh.jinja2 index c6051411..e3f59239 100644 --- a/babs/templates/participant_job.sh.jinja2 +++ b/babs/templates/participant_job.sh.jinja2 @@ -52,7 +52,7 @@ cd "${BRANCH}" # datalad clone the input ria: echo '# Clone the data from input RIA:' -datalad clone "${dssource}" ds -- --no-checkout +datalad clone "${dssource}" ds -- --no-checkout --shared cd ds # set up the result deposition: @@ -69,45 +69,120 @@ if ! git sparse-checkout init --cone; then exit 1 fi +{% set _sparse_input_paths = input_datasets | map(attribute='path_in_babs') | unique | list %} git sparse-checkout set \ code \ - containers \ -{% for input_dataset in input_datasets %} - {{ input_dataset['path_in_babs'] }}{% if not loop.last %} \ + containers{% if _sparse_input_paths %} \ +{% for p in _sparse_input_paths %} + {{ p }}{% if not loop.last %} \ {% endif %} {% endfor %} +{% endif %} git checkout -f # Start of the application-specific code: ------------------------------ -# pull down only needed session path and explicit dataset-level metadata: -echo "# Pull down the input session but don't retrieve data contents:" +{{ zip_locator_text }} + +# Input datasets: ``datalad clone --no-checkout``, sparse paths, ``git checkout -f``. +# +# is_installed_repo_root: true only when $1 is the root of an installed Git repo +is_installed_repo_root() { + local mount="$1" + local mount_abs top + + [ -n "${mount}" ] || return 1 + [ -d "${mount}" ] || return 1 + + mount_abs="$(cd "${mount}" 2>/dev/null && pwd -P)" || return 1 + top="$(git -C "${mount}" rev-parse --show-toplevel 2>/dev/null)" || return 1 + + [ "${top}" = "${mount_abs}" ] +} + +# ensure_input_submodule: if $1 is nonempty and not already an installed Git repo root, +# clone it as a DataLad subdataset using .gitmodules (prefer datalad-url, else url). Plain +# absolute paths are converted to file:// URLs. Uses --no-checkout. No-op if already present; +# exits if no URL is configured. +ensure_input_submodule() { + local mount="$1" + [ -n "${mount}" ] || return 0 + + if is_installed_repo_root "${mount}"; then + return 0 + fi + + local src + src="$(git config -f .gitmodules --get "submodule.${mount}.datalad-url" 2>/dev/null)" \ + || src="$(git config -f .gitmodules --get "submodule.${mount}.url" 2>/dev/null)" \ + || src="" + if [ -n "${src}" ]; then + case "${src}" in + /*) src="file://${src}" ;; + esac + datalad clone --reckless ephemeral -d . "${src}" "${mount}" -- --no-checkout --shared + else + echo "ERROR: no submodule URL in .gitmodules for ${mount}" 1>&2 + exit 1 + fi +} + +# sparse_checkout_input: if $1 is nonempty and is the root of an installed Git repo +# (for example, a submodule path from .gitmodules such as inputs/data/BIDS), initialize +# sparse-checkout in non-cone mode, set patterns from remaining args (one path per line +# via --stdin), and checkout -f. Uses git -C to not change working directory. +# No-op if $1 is empty or not an installed +# Git repo root. +sparse_checkout_input() { + local mount="$1" + shift + [ -n "${mount}" ] || return 0 + is_installed_repo_root "${mount}" || return 0 + + git -C "${mount}" sparse-checkout init --no-cone + printf '%s\n' "$@" | git -C "${mount}" sparse-checkout set --stdin + git -C "${mount}" checkout -f +} + +echo "# Install input datasets (sparse worktrees)" {% for input_dataset in input_datasets %} {% if not input_dataset['is_zipped'] %} -datalad get -n "{{ input_dataset['path_in_babs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" - -datalad get -n "{{ input_dataset['path_in_babs'] }}/dataset_description.json" -{% else %} -datalad get -n "{{ input_dataset['path_in_babs'] }}" +ensure_input_submodule "{{ input_dataset['path_in_babs'] }}" +sparse_checkout_input "{{ input_dataset['path_in_babs'] }}" \ + "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" \ + "dataset_description.json" + +datalad get -n \ + "{{ input_dataset['path_in_babs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" \ + "{{ input_dataset['path_in_babs'] }}/dataset_description.json" {% endif %} {% endfor %} -# Restrict each BIDS input subdataset to the current subject so BIDS apps that index the full dataset (e.g. pybids BIDSLayout) don't try to read other subjects' files, which may not be retrieved -{% for input_dataset in input_datasets %} -{% if not input_dataset['is_zipped'] %} -if [ -d "{{ input_dataset['path_in_babs'] }}/.git" ]; then - ( cd "{{ input_dataset['path_in_babs'] }}" && \ - ( git sparse-checkout init --no-cone 2>/dev/null && \ - { echo "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}"; echo 'dataset_description.json'; } | git sparse-checkout set --stdin 2>/dev/null ) ) || true -fi +{% if input_datasets | selectattr('is_zipped') | list %} +{% for mount, group in (input_datasets | selectattr('is_zipped') | list | groupby('path_in_babs')) %} +ensure_input_submodule "{{ mount }}" +{% for input_dataset in group %} +{% set u = input_dataset['name'] | upper %} +{{ u }}_ZIP_REL="$(find_single_zip_in_git_tree {{ mount }} {{ input_dataset['name'] }})" +{{ u }}_ZIP="{{ mount }}/{% raw %}${{% endraw %}{{ u }}_ZIP_REL{% raw %}}{% endraw %}" +echo 'found {{ input_dataset['name'] }} zipfile:' +echo "${%raw%}{{%endraw%}{{ u }}_ZIP{%raw%}}{%endraw%}" +{% endfor %} +sparse_checkout_input "{{ mount }}" \ +{% for input_dataset in group %} + "${%raw%}{{%endraw%}{{ input_dataset['name'] | upper }}_ZIP_REL{%raw%}}{%endraw%}"{% if not loop.last %} \ {% endif %} {% endfor %} -{{ zip_locator_text }} +{% for input_dataset in group %} +{% set u = input_dataset['name'] | upper %} +datalad get -n "${%raw%}{{%endraw%}{{ u }}_ZIP{%raw%}}{%endraw%}" +{% endfor %} +{% endfor %} +{% endif %} # Link to shared container image so each job does not re-clone the same image. -# If shared path is not available (e.g. Slurm Docker workers), retrieve image in this clone. CONTAINER_SHARED="${PROJECT_ROOT}/analysis/containers/.datalad/environments/{{ container_name }}/image" CONTAINER_JOB="containers/.datalad/environments/{{ container_name }}/image"