diff --git a/datalad_next/patches/run.py b/datalad_next/patches/run.py index b7672d7a..6348c1bc 100644 --- a/datalad_next/patches/run.py +++ b/datalad_next/patches/run.py @@ -1,4 +1,10 @@ -"""Enhance ``run()`` placeholder substitutions to honor configuration defaults +"""Enhance datalad-core's ``run()`` + +Portable path handling logic for run-records +-------------------------------------------- + +Placeholder substitutions to honor configuration defaults +--------------------------------------------------------- Previously, ``run()`` would not recognize configuration defaults for placeholder substitution. This means that any placeholders globally declared in @@ -19,6 +25,12 @@ """ from itertools import filterfalse +from os.path import lexists +from pathlib import ( + PurePath, + PureWindowsPath, + PurePosixPath, +) import sys from datalad.core.local.run import ( @@ -26,7 +38,10 @@ SequenceFormatter, normalize_command, quote_cmdlinearg, + _create_record as _orig_create_record, ) +from datalad.distribution.dataset import Dataset +from datalad.local.rerun import get_run_info as _orig_get_run_info from datalad.interface.common_cfg import definitions as cfg_defs from datalad.support.constraints import EnsureStr from datalad.support.extensions import register_config @@ -34,6 +49,95 @@ from . import apply_patch +# Deals with https://github.com/datalad/datalad/issues/7512 +def _create_record(run_info, sidecar_flag, ds): + # convert any input/output specification to a POSIX path + for k in ('inputs', 'outputs'): + if k not in run_info: + continue + run_info[k] = [_get_posix_relpath_for_runrecord(p) + for p in run_info[k]] + + return _orig_create_record(run_info, sidecar_flag, ds) + + +def _get_posix_relpath_for_runrecord(path): + p = PurePath(path) + if p.is_absolute(): + # there is no point in converting an absolute path + # to a different platform convention. + # return as-is + return path + + return str(PurePosixPath(p)) + + +# Deals with https://github.com/datalad/datalad/issues/7512 +def get_run_info(dset, message): + msg, run_info = _orig_get_run_info(dset, message) + if run_info is None: + # nothing to process, return as-is + return msg, run_info + + for k in ('inputs', 'outputs'): + if k not in run_info: + continue + run_info[k] = [_get_platform_path_from_runrecord(p, dset) + for p in run_info[k]] + return msg, run_info + + +def _get_platform_path_from_runrecord(path: str, ds: Dataset) -> PurePath: + """Helper to standardize run_info path handling + + Previously, run-records would contain platform-paths (e.g., windows paths + when added on windows, POSIX paths elsewhere). This made cross-platform + rerun impossible out-of-the box, but it also means that such dataset are + out there in unknown numbers. + + This helper inspects any input/output path reported by get_run_info() + and tries to ensure that it matches platform conventions. + + Parameters + ---------- + path: str + A str-path from an input/output specification + ds: Dataset + This dataset's base path is used for existence testing for + convention determination. + + Returns + ------- + str + """ + # we only need to act differently, when an incoming path is + # windows. This is not possible to say with 100% confidence, + # because a POSIX path can also contain a backslash. We support + # a few standard cases where we CAN tell + try: + pathobj = None + if '\\' not in path: + # no windows pathsep, no problem + pathobj = PurePosixPath(path) + # let's assume it is windows for a moment + elif lexists(str(ds.pathobj / PureWindowsPath(path))): + # if there is something on the filesystem for this path, + # we can be reasonably sure that this is indeed a windows + # path. This won't catch everything, but better than nothing + pathobj = PureWindowsPath(path) + else: + # if we get here, we have no idea, and no means to verify + # further hypotheses -- go with the POSIX assumption + # and hope for the best + pathobj = PurePosixPath(path) + assert pathobj is not None + except Exception: + return path + + # we report in platform-conventions + return str(PurePath(pathobj)) + + # This function is taken from datalad-core@a96c51c0b2794b2a2b4432ec7bd51f260cb91a37 # datalad/core/local/run.py # The change has been proposed in https://github.com/datalad/datalad/pull/7509 @@ -80,6 +184,11 @@ def not_subst(x): apply_patch( 'datalad.core.local.run', None, 'format_command', format_command) +apply_patch( + 'datalad.core.local.run', None, '_create_record', _create_record) +apply_patch( + 'datalad.local.rerun', None, 'get_run_info', get_run_info) + register_config( 'datalad.run.substitutions.python', 'Substitution for {python} placeholder', diff --git a/datalad_next/patches/tests/test_run.py b/datalad_next/patches/tests/test_run.py index 721e6de9..87c545c5 100644 --- a/datalad_next/patches/tests/test_run.py +++ b/datalad_next/patches/tests/test_run.py @@ -1,5 +1,8 @@ +from pathlib import Path import pytest +from datalad.local.rerun import get_run_info + from datalad_next.exceptions import IncompleteResultsError from datalad_next.tests.utils import ( SkipTest, @@ -23,3 +26,88 @@ def test_substitution_config_default(existing_dataset): # make sure we could actually detect breakage with the check above with pytest.raises(IncompleteResultsError): ds.run('{python} -c "breakage"', result_renderer='disabled') + + +def test_runrecord_portable_paths(existing_dataset): + ds = existing_dataset + dsrepo = ds.repo + infile = ds.pathobj / 'inputs' / 'testfile.txt' + outfile = ds.pathobj / 'outputs' / 'testfile.txt' + infile.parent.mkdir() + outfile.parent.mkdir() + infile.touch() + ds.save() + assert not outfile.exists() + # script copies any 'inputs' to the outputs dir + res = ds.run( + '{python} -c "' + 'from shutil import copyfile;' + 'from pathlib import Path;' + r"""[copyfile(f.strip('\"'), Path.cwd() / \"outputs\" / Path(f.strip('\"')).name)""" + # we need to use a raw string to contain the inputs expansion, + # on windows they would contain backslashes that are unescaped + r""" for f in r'{inputs}'.split()]""" + '"', + result_renderer='disabled', + # we need to pass relative paths ourselves + # https://github.com/datalad/datalad/issues/7516 + inputs=[str(infile.relative_to(ds.pathobj))], + outputs=[str(outfile.relative_to(ds.pathobj))], + ) + # verify basic outcome + assert_result_count(res, 1, action='run', status='ok') + assert outfile.exists() + + # branch we expect the runrecord on + branch = dsrepo.get_corresponding_branch() or dsrepo.get_active_branch() + cmsg = dsrepo.format_commit('%B', branch) + + # the IOspecs are stored in POSIX conventions + assert r'"inputs/testfile.txt"' in cmsg + assert r'"outputs/testfile.txt"' in cmsg + + # get_run_info() reports in platform conventions + msg, run_info = get_run_info(ds, cmsg) + assert run_info + for k in ('inputs', 'outputs'): + specs = run_info.get(k) + assert len(specs) > 0 + for p in specs: + assert (ds.pathobj / p).exists() + + +def test_runrecord_oldnative_paths(existing_dataset): + ds = existing_dataset + # this test is imitating a rerun situation, so we create the + # inputs and outputs + infile = ds.pathobj / 'inputs' / 'testfile.txt' + outfile = ds.pathobj / 'outputs' / 'testfile.txt' + for fp in (infile, outfile): + fp.parent.mkdir() + fp.touch() + + cmsg = ( + '[DATALAD RUNCMD] /home/mih/env/datalad-dev/bin/python -c ...\n\n' + '=== Do not change lines below ===\n' + '{\n' + ' "chain": [],\n' + ' "cmd": "{python} -c True",\n' + # use the ID of the test dataset to ensure proper association + f' "dsid": "{ds.id}",\n' + ' "exit": 0,\n' + ' "extra_inputs": [],\n' + ' "inputs": [\n' + # make windows path, used to be stored in escaped form + r' "inputs\\testfile.txt"' '\n' + ' ],\n' + ' "outputs": [\n' + # make windows path, used to be stored in escaped form + r' "outputs\\testfile.txt"' '\n' + ' ],\n' + ' "pwd": "."\n' + '}\n' + '^^^ Do not change lines above ^^^\n' + ) + msg, run_info = get_run_info(ds, cmsg) + assert run_info['inputs'][0] == str(Path('inputs', 'testfile.txt')) + assert run_info['outputs'][0] == str(Path('outputs', 'testfile.txt'))