Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions babs/generate_submit_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,9 @@ def generate_submit_script(
varname_taskid = 'SLURM_ARRAY_TASK_ID'
varname_jobid = 'SLURM_ARRAY_JOB_ID'

# If any input dataset is zipped, get the setup for the zipfile locator:
zip_locator_template = env.get_template('determine_zipfilename.sh.jinja2')
zip_locator_text = zip_locator_template.render(
zip_locator_text = env.get_template('determine_zipfilename.sh.jinja2').render(
input_datasets=input_datasets,
processing_level=processing_level,
has_a_zipped_input_dataset=any(
input_dataset['is_zipped'] for input_dataset in input_datasets
),
)

return participant_job_template.render(
Expand Down
27 changes: 16 additions & 11 deletions babs/templates/determine_zipfilename.sh.jinja2
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
{% if input_datasets | selectattr('is_zipped') | list %}
# shellcheck disable=SC1091
{% if has_a_zipped_input_dataset %}
find_single_zip_in_git_tree() {{ '{' }}
local zip_search_path="$1"
local name="$2"
local hits count

# Bounded listing: ``ls-files --with-tree=HEAD`` supports ``:(glob)`` (works with no-checkout clone).
# Session jobs: require ``_${sesid}_`` in the filename.
# Subject jobs: do not use ``${sesid}`` in the pattern.
hits="$(
git -C "${zip_search_path}" ls-tree -r --name-only HEAD \
| grep -E "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}{% raw %}.*${sesid}{% endraw %}{% endif %}{% raw %}.*${name}.*\.zip${% endraw %}" \
git -C "${zip_search_path}" ls-files --with-tree=HEAD -- \
"${subid}" \
":(glob)${subid}_*" \
":(glob)${subid}.*" \
2>/dev/null | sed '/^$/d' | sort -u \
{% if processing_level == 'session' %}
| grep -E "{% raw %}^${subid}_${sesid}_${name}.*\.zip${% endraw %}" \
{% else %}
| grep -E "{% raw %}^${subid}_.*${name}.*\.zip${% endraw %}" \
{% endif %}
|| true
)"

Expand All @@ -20,14 +31,8 @@ find_single_zip_in_git_tree() {{ '{' }}
{% endraw %}
fi

printf "%s/%s\n" "${zip_search_path}" "${hits}"
# Repo-relative path only; caller joins with path_in_babs for absolute path from job clone.
printf '%s\n' "${hits}"
{{ '}' }}

{% for input_dataset in input_datasets %}
{% if input_dataset['is_zipped'] %}
{{ input_dataset['name'].upper() }}_ZIP="$(find_single_zip_in_git_tree {{ input_dataset['path_in_babs'] }} {{ input_dataset['name'] }})"
echo 'found {{ input_dataset['name'] }} zipfile:'
echo "${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}"
{% endif %}
{% endfor %}
{% endif %}
117 changes: 96 additions & 21 deletions babs/templates/participant_job.sh.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ cd "${BRANCH}"

# datalad clone the input ria:
echo '# Clone the data from input RIA:'
datalad clone "${dssource}" ds -- --no-checkout
datalad clone "${dssource}" ds -- --no-checkout --shared
cd ds

# set up the result deposition:
Expand All @@ -69,45 +69,120 @@ if ! git sparse-checkout init --cone; then
exit 1
fi

{% set _sparse_input_paths = input_datasets | map(attribute='path_in_babs') | unique | list %}
git sparse-checkout set \
code \
containers \
{% for input_dataset in input_datasets %}
{{ input_dataset['path_in_babs'] }}{% if not loop.last %} \
containers{% if _sparse_input_paths %} \
{% for p in _sparse_input_paths %}
{{ p }}{% if not loop.last %} \
{% endif %}
{% endfor %}
{% endif %}

git checkout -f

# Start of the application-specific code: ------------------------------

# pull down only needed session path and explicit dataset-level metadata:
echo "# Pull down the input session but don't retrieve data contents:"
{{ zip_locator_text }}

# Input datasets: ``datalad clone --no-checkout``, sparse paths, ``git checkout -f``.
#
# is_installed_repo_root: succeed (return 0) only when $1 names the top-level
# directory of an installed Git repository — not a subdirectory of one, not a
# missing path, and not an empty argument.
is_installed_repo_root() {
  local candidate="$1"
  local resolved toplevel

  if [ -z "${candidate}" ] || [ ! -d "${candidate}" ]; then
    return 1
  fi

  # Physical path of the candidate, symlinks resolved.
  resolved="$(cd "${candidate}" 2>/dev/null && pwd -P)" || return 1
  # Top of the working tree as git sees it; fails outside any repo.
  toplevel="$(git -C "${candidate}" rev-parse --show-toplevel 2>/dev/null)" || return 1

  [ "${toplevel}" = "${resolved}" ]
}

# ensure_input_submodule: make sure the subdataset mounted at $1 is present.
# Empty argument or an already-installed Git repo root is a no-op. Otherwise
# the clone URL is resolved from .gitmodules (datalad-url preferred, plain
# url as fallback); a bare absolute path is turned into a file:// URL. The
# clone is done with --no-checkout. Exits the job if no URL is configured.
ensure_input_submodule() {
  local mount="$1"
  local origin

  # Nothing to do for an empty path or a repo that is already installed.
  if [ -z "${mount}" ] || is_installed_repo_root "${mount}"; then
    return 0
  fi

  origin="$(git config -f .gitmodules --get "submodule.${mount}.datalad-url" 2>/dev/null)" \
    || origin="$(git config -f .gitmodules --get "submodule.${mount}.url" 2>/dev/null)" \
    || origin=""

  if [ -z "${origin}" ]; then
    echo "ERROR: no submodule URL in .gitmodules for ${mount}" 1>&2
    exit 1
  fi

  # Plain absolute paths must be spelled as file:// URLs for the clone.
  case "${origin}" in
    /*) origin="file://${origin}" ;;
  esac
  datalad clone --reckless ephemeral -d . "${origin}" "${mount}" -- --no-checkout --shared
}

# sparse_checkout_input: restrict the installed Git repo at $1 to the path
# patterns supplied as the remaining arguments (fed one per line through
# ``sparse-checkout set --stdin`` in non-cone mode), then ``checkout -f``.
# Runs everything via ``git -C`` so the caller's working directory is never
# changed. Silently does nothing when $1 is empty or is not the root of an
# installed Git repository.
sparse_checkout_input() {
  local target="$1"
  shift

  if [ -z "${target}" ]; then
    return 0
  fi
  is_installed_repo_root "${target}" || return 0

  git -C "${target}" sparse-checkout init --no-cone
  printf '%s\n' "$@" | git -C "${target}" sparse-checkout set --stdin
  git -C "${target}" checkout -f
}

echo "# Install input datasets (sparse worktrees)"
{% for input_dataset in input_datasets %}
{% if not input_dataset['is_zipped'] %}
datalad get -n "{{ input_dataset['path_in_babs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}"

datalad get -n "{{ input_dataset['path_in_babs'] }}/dataset_description.json"
{% else %}
datalad get -n "{{ input_dataset['path_in_babs'] }}"
ensure_input_submodule "{{ input_dataset['path_in_babs'] }}"
sparse_checkout_input "{{ input_dataset['path_in_babs'] }}" \
"{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" \
"dataset_description.json"

datalad get -n \
"{{ input_dataset['path_in_babs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" \
"{{ input_dataset['path_in_babs'] }}/dataset_description.json"
{% endif %}
{% endfor %}

# Restrict each BIDS input subdataset to the current subject so BIDS apps that index the full dataset (e.g. pybids BIDSLayout) don't try to read other subjects' files, which may not be retrieved
{% for input_dataset in input_datasets %}
{% if not input_dataset['is_zipped'] %}
if [ -d "{{ input_dataset['path_in_babs'] }}/.git" ]; then
( cd "{{ input_dataset['path_in_babs'] }}" && \
( git sparse-checkout init --no-cone 2>/dev/null && \
{ echo "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}"; echo 'dataset_description.json'; } | git sparse-checkout set --stdin 2>/dev/null ) ) || true
fi
{% if input_datasets | selectattr('is_zipped') | list %}
{% for mount, group in (input_datasets | selectattr('is_zipped') | list | groupby('path_in_babs')) %}
ensure_input_submodule "{{ mount }}"
{% for input_dataset in group %}
{% set u = input_dataset['name'] | upper %}
{{ u }}_ZIP_REL="$(find_single_zip_in_git_tree {{ mount }} {{ input_dataset['name'] }})"
{{ u }}_ZIP="{{ mount }}/{% raw %}${{% endraw %}{{ u }}_ZIP_REL{% raw %}}{% endraw %}"
echo 'found {{ input_dataset['name'] }} zipfile:'
echo "${%raw%}{{%endraw%}{{ u }}_ZIP{%raw%}}{%endraw%}"
{% endfor %}
sparse_checkout_input "{{ mount }}" \
{% for input_dataset in group %}
"${%raw%}{{%endraw%}{{ input_dataset['name'] | upper }}_ZIP_REL{%raw%}}{%endraw%}"{% if not loop.last %} \
{% endif %}
{% endfor %}

{{ zip_locator_text }}
{% for input_dataset in group %}
{% set u = input_dataset['name'] | upper %}
datalad get -n "${%raw%}{{%endraw%}{{ u }}_ZIP{%raw%}}{%endraw%}"
{% endfor %}
{% endfor %}
{% endif %}

# Link to shared container image so each job does not re-clone the same image.
# If shared path is not available (e.g. Slurm Docker workers), retrieve image in this clone.
CONTAINER_SHARED="${PROJECT_ROOT}/analysis/containers/.datalad/environments/{{ container_name }}/image"
CONTAINER_JOB="containers/.datalad/environments/{{ container_name }}/image"

Expand Down
Loading