diff --git a/cadetrdm/batch_running/case.py b/cadetrdm/batch_running/case.py index ffd3ee0..81c324b 100644 --- a/cadetrdm/batch_running/case.py +++ b/cadetrdm/batch_running/case.py @@ -1,16 +1,17 @@ +from __future__ import annotations + import os import traceback import warnings from pathlib import Path import subprocess -from typing import Dict +from typing import Any # from cadetrdm.container.containerAdapter import ContainerAdapter from cadetrdm.batch_running import Study from cadetrdm.repositories import ProjectRepo from cadetrdm import Options from cadetrdm.environment import Environment -from cadetrdm.logging import LogEntry class Case: @@ -40,15 +41,26 @@ def __init__( project_repo = ProjectRepo(project_repo) self.project_repo = project_repo self.options = options - self.environment = environment - self._options_hash = options.get_hash() - self.results_branch = None + self._current_environment = None + self.environment = environment + self.run_method = run_method + self._results_branch = None + self._results_path = None + def __str__(self): return self.name + @property + def options_hash(self): + return self.options.get_hash() + + @property + def output_repo(self): + return self.project_repo.output_repo + @property def status_file(self): return Path(self.project_repo.path).parent / (Path(self.project_repo.path).name + ".status") @@ -103,89 +115,19 @@ def is_running(self, ): return False - @property - def has_results_for_this_run(self): - self.results_branch = self._get_results_branch() - if self.results_branch is None: - return False - else: - return True - - def _get_results_branch(self): - """ - Search the output log for an entry (i.e. a results branch) with matching study commit hash and options hash. - If environment is given, the environment is also enforced. - - :return: - str: name of the results branch or None. - """ - output_log: Dict[str, LogEntry] = self.project_repo.output_log.entries - options_hash = self.options.get_hash() - study_hash = self.project_repo.current_commit_hash - - found_results_with_incorrect_study_hash = False - found_results_with_incorrect_options = False - found_results_with_incorrect_environment = False - - semi_correct_hits = [] - for output_branch, log_entry in reversed(output_log.items()): - matches_study_hash = log_entry.matches_study_hash(study_hash) - matches_options_hash = log_entry.matches_options_hash(options_hash) - matches_environment = log_entry.fulfils_environment(self.environment) - - if matches_study_hash and matches_options_hash and matches_environment: - return log_entry.output_repo_branch - - elif matches_study_hash and not matches_options_hash and matches_environment: - found_results_with_incorrect_options = True - semi_correct_hits.append( - f"Found matching study commit hash {study_hash[:7]}, but incorrect options hash " - f"(needs: {options_hash[:7]}, has: {log_entry.options_hash[:7]})" - ) - - elif not matches_study_hash and matches_options_hash and matches_environment: - found_results_with_incorrect_study_hash = True - semi_correct_hits.append( - f"Found matching options hash {options_hash[:7]}, but incorrect study commit hash " - f"(needs: {study_hash[:7]}, has: {log_entry.project_repo_commit_hash[:7]})" - ) - elif matches_study_hash and matches_options_hash and not matches_environment: - found_results_with_incorrect_environment = True - semi_correct_hits.append( - f"Found matching options hash {options_hash[:7]}, matching study commit hash " - f"{study_hash[:7]}, but wrong environment specification." - ) - - if found_results_with_incorrect_study_hash: - [print(line) for line in semi_correct_hits] - print( - "No matching results were found for this study version, but results with these options were found for " - "other study versions. Did you recently update the study?" - ) - elif found_results_with_incorrect_options: - [print(line) for line in semi_correct_hits] - print( - "No matching results were found for these options, but results with other options were found for " - "this study versions. Did you recently change the options?" - ) - elif found_results_with_incorrect_environment: - [print(line) for line in semi_correct_hits] - print( - "No matching results were found for this environment, but results with other environments were " - "found for this study versions." - ) - - else: - print("No matching results were found for these options and study version.") - return None - - def run_study(self, force=False, container_adapter: "ContainerAdapter" = None, command: str = None) -> bool: + def run_study( + self, + force: bool = False, + container_adapter: "ContainerAdapter" | None = None, + command: str | None = None, + **load_kwargs: Any, + ) -> Path | None: """ Run specified study commands in the given repository. :returns - boolean indicating if the results for this case are available, either pre-computed or computed now. - + Return path to results for this case if available (either + pre-computed or newly computed), else return None. """ if not force and self.is_running: print(f"{self.project_repo.name} is currently running. Skipping...") @@ -197,14 +139,16 @@ def run_study(self, force=False, container_adapter: "ContainerAdapter" = None, c else: print("WARNING: Not updating the repositories while in debug mode.") - if self.has_results_for_this_run and not force: + results_path = self.load(**load_kwargs) + + if results_path and not force: print(f"{self.project_repo.path} has already been computed with these options. Skipping...") - return True + return results_path if container_adapter is None and self.can_run_study is False: print(f"Current environment does not match required environment. Skipping...") self.status = 'failed' - return False + return try: self.status = 'running' @@ -213,7 +157,7 @@ def run_study(self, force=False, container_adapter: "ContainerAdapter" = None, c log, return_code = container_adapter.run_case(self, command=command) if return_code != 0: self.status = "failed" - return False + return else: module = self.project_repo.module run_method = getattr(module, self.run_method) @@ -221,47 +165,155 @@ def run_study(self, force=False, container_adapter: "ContainerAdapter" = None, c print("Command execution successful.") self.status = 'finished' - return True + results_path = self.load() + return results_path except (KeyboardInterrupt, Exception) as e: traceback.print_exc() self.status = 'failed' - return False + return @property def can_run_study(self) -> bool: - return self.environment is None or self._check_execution_environment() - def _check_execution_environment(self): + return ( + self.environment is None + or + self.current_environment.fulfils_environment(self.environment) + ) + + @property + def current_environment(self): if self._current_environment is None: - existing_environment = subprocess.check_output(f"conda env export", shell=True).decode() - self._current_environment = Environment.from_yml_string(existing_environment) + existing_environment = subprocess.check_output( + f"conda env export", shell=True + ).decode() + self._current_environment = Environment.from_yml_string( + existing_environment + ) - return self._current_environment.fulfils_environment(self.environment) + return self._current_environment @property - def _results_path(self): + def has_results_for_this_run(self): if self.results_branch is None: - return None + return False else: - return self.project_repo.cache_folder_for_branch(self.results_branch) + return True - def load(self, ): - if self.results_branch is None or self.options.get_hash() != self._options_hash: - self.results_branch = self._get_results_branch() - self._options_hash = self.options.get_hash() + @property + def results_branch(self) -> str | None: + return self._get_results_branch() - if self.results_branch is None: - print(f"No results available for Case({self.project_repo.path, self.options.get_hash()[:7]})") - return None + def _get_results_branch( + self, + allow_commit_hash_mismatch: bool = False, + allow_options_hash_mismatch: bool = False, + allow_environment_mismatch: bool = False, + ) -> str | None: + """ + Return the output branch matching the current study and options. - if self._results_path.exists(): - return + Args: + allow_commit_hash_mismatch: If True, allow mismatched study commit hash. + allow_options_hash_mismatch: If True, allow mismatched options hash. + allow_environment_mismatch: If True, allow mismatched environment. - self.project_repo.copy_data_to_cache(self.results_branch) + Returns: + str | None: Name of the results branch, or None if no match found. + """ + options_hash = self.options_hash + commit_hash = self.project_repo.current_commit_hash + log_entries = self.output_repo.output_log.entries + + for output_repo_branch, entry in reversed(log_entries.items()): + environment_ok = entry.fulfils_environment(self.environment) + + # Exact match (all properties match) + if ( + entry.options_hash == options_hash + and entry.project_repo_commit_hash == commit_hash + and environment_ok + ): + return output_repo_branch + + # Check environment + if not allow_environment_mismatch and not environment_ok: + continue + + # Check study hash + if entry.project_repo_commit_hash != commit_hash and not allow_commit_hash_mismatch: + continue + + # Check options hash + if entry.options_hash != options_hash and not allow_options_hash_mismatch: + continue + + # Semi-correct match (at least one mismatch, but allowed) + msg_parts = [] + if entry.options_hash != options_hash: + msg_parts.append( + "mismatched options hash " + f"(needs: {options_hash[:7]}, " + f"has: {entry.options_hash[:7]})" + ) + if entry.project_repo_commit_hash != commit_hash: + msg_parts.append( + "mismatched project repo hash " + f"(needs: {commit_hash[:7]}, " + f"has: {entry.project_repo_commit_hash[:7]})" + ) + if not entry.fulfils_environment(self.environment): + msg_parts.append("mismatched environment") + + if msg_parts: + msg = "Found matching entry with: " + ", ".join(msg_parts) + print(f"{msg}. Using results from branch: {output_repo_branch}") + return output_repo_branch + + print("No matching results found.") @property - def results_path(self): - self.load() + def results_path(self) -> Path | None: + return self.load() + + def load( + self, + allow_commit_hash_mismatch: bool = False, + allow_options_hash_mismatch: bool = False, + allow_environment_mismatch: bool = False, + ) -> Path | None: + """ + Load results for the current case. + + Args: + allow_commit_hash_mismatch: If True, allow loading results with mismatched study commit hash. + allow_options_hash_mismatch: If True, allow loading results with mismatched options hash. + allow_environment_mismatch: If True, allow loading results with mismatched environment. + + Returns: + Path to results. + """ + results_branch = self._get_results_branch( + allow_commit_hash_mismatch=allow_commit_hash_mismatch, + allow_options_hash_mismatch=allow_options_hash_mismatch, + allow_environment_mismatch=allow_environment_mismatch, + ) + + # Early exit if no results are available + if results_branch is None: + print( + f"No results available for Case(" + f"project_repo={self.project_repo.path}, " + f"options_hash={self.options_hash[:7]}" + f")" + ) + return + + # Load results if the path exists, otherwise fetch them + results_path = self.project_repo.copy_data_to_cache(results_branch) + if not results_path.exists(): + print("Failed to fetch results.") + return - return self._results_path + return results_path diff --git a/cadetrdm/initialize_repo.py b/cadetrdm/initialize_repo.py index e9852dd..17727a1 100644 --- a/cadetrdm/initialize_repo.py +++ b/cadetrdm/initialize_repo.py @@ -226,32 +226,49 @@ def initialize_output_repo(output_folder_name, gitignore: list = None, def create_environment_yml(): - file_lines = ["name: rdm_example", "channels:", " - conda-forge", "dependencies:", " - python=3.10", - " - cadet", " - pip", " - pip:", " - cadet-process", " - cadet-rdm"] + file_lines = [ + "name: rdm_example", + "channels:", + " - conda-forge", + "dependencies:", + " - python=3.10", + " - cadet", + " - pip", + " - pip:", + " - cadet-process", + " - cadet-rdm" + ] if not os.path.exists("environment.yml"): write_lines_to_file("environment.yml", file_lines, open_type="w") def create_readme(): - readme_lines = ["## Output Repository", - "", - "The output data for this case study can be found here:", - "[Link to Output Repository]() (not actually set yet because no remote has been configured at this moment)"] + readme_lines = [ + "## Output Repository", + "", + "The output data for this case study can be found here:", + "[Link to Output Repository]() (not actually set yet because no remote has been configured at this moment)" + ] write_lines_to_file("README.md", readme_lines, open_type="a") def create_output_readme(): - readme_lines = ["# Output repository for Example Simulation with CADET", - "This repository stores the simulation results for RDM-Example. `CADET-RDM` automatically tracks all simulations that are started by running `main.py` from the corresponding project repository.", - "", - "Each simulation run creates a dedicated branch in this output repository. The results are saved within the `src` folder of the respective branch. Additionally, a `log.tsv` file in the main branch records metadata for all runs, uniquely linking each output branch to its originating run in the project repository.", - "", - "## Project Repository", - "", - "The project repository for this case study is available here: ", - "[Link to Project Repository]() (not actually set yet because no remote has been configured at this moment)"] + readme_lines = [ + "# Output repository for Example Simulation with CADET", + "This repository stores the simulation results for RDM-Example. " + "`CADET-RDM` automatically tracks all simulations that are started by running `main.py` from the corresponding project repository.", + "", + "Each simulation run creates a dedicated branch in this output repository. " + "The results are saved within the `src` folder of the respective branch. " + "Additionally, a `log.tsv` file in the main branch records metadata for all runs, uniquely linking each output branch to its originating run in the project repository.", + "", + "## Project Repository", + "", + "The project repository for this case study is available here: ", + "[Link to Project Repository]() (not actually set yet because no remote has been configured at this moment)" + ] write_lines_to_file("README.md", readme_lines, open_type="a") def create_dockerfile(): - write_lines_to_file("Dockerfile", dockerfile_template, open_type="w") \ No newline at end of file + write_lines_to_file("Dockerfile", dockerfile_template, open_type="w") diff --git a/cadetrdm/repositories.py b/cadetrdm/repositories.py index 5cac0c8..71f6d65 100644 --- a/cadetrdm/repositories.py +++ b/cadetrdm/repositories.py @@ -1,6 +1,5 @@ import contextlib import csv -import gc import glob import importlib import json @@ -9,6 +8,7 @@ import sys import traceback import warnings +from collections import defaultdict from datetime import datetime from pathlib import Path from stat import S_IREAD, S_IWRITE @@ -823,7 +823,10 @@ def __init__( if not (self.path / self._output_folder).exists(): print("Output repository was missing, cloning now.") self._clone_output_repo() - self.output_repo = OutputRepo(self.path / self._output_folder) + self.output_repo = OutputRepo( + self.path / self._output_folder, + self, + ) if self._metadata["cadet_rdm_version"] != cadetrdm.__version__: self._update_version(self._metadata, cadetrdm.__version__) @@ -1475,6 +1478,15 @@ def capture_error(self, error): class OutputRepo(BaseRepo): + def __init__( + self, + *args: Any, + project_repo: ProjectRepo | None = None, + **kwargs: Any, + ): + self.project_repo = project_repo + super().__init__(*args, **kwargs) + @property def output_log_file_path(self): if not self.active_branch == self.main_branch: @@ -1497,6 +1509,57 @@ def print_output_log(self): self.checkout(self._most_recent_branch) + @property + def project_repo_branches(self) -> set[str]: + """All project repo branches that have been run.""" + return set([ + entry.project_repo_branch + for entry in self.output_log.entries.values() + ]) + + @property + def project_repo_commit_hashes(self) -> set[str]: + """All project repo commit hashes that have been run.""" + return set([ + entry.project_repo_commit_hash + for entry in self.output_log.entries.values() + ]) + + @property + def options_hashes(self) -> set[str]: + """All option hashes that have been run.""" + return set([ + entry.options_hash + for entry in self.output_log.entries.values() + ]) + + @property + def options_to_commit_map(self) -> dict[str, list[str]]: + """ + Maps each option hash to the commit hashes where it was run. + + Returns: + dict: Keys are option hashes, values are lists of commit hashes. + """ + mapping = defaultdict(list) + for entry in self.output_log.entries.values(): + mapping[entry.options_hash].append(entry.project_repo_commit_hash) + return dict(mapping) + + @property + def commit_to_options_map(self) -> dict[str, list[str]]: + """ + Map each commit hash to the option hashes run at that commit. + + Returns: + dict: Keys are commit hashes, values are lists of option hashes. + """ + mapping = defaultdict(list) + for entry in self.output_log.entries.values(): + mapping[entry.project_repo_commit_hash].append(entry.options_hash) + return dict(mapping) + + def add_filetype_to_lfs(self, file_type): """ Add the filetype given in file_type to the GIT-LFS tracking diff --git a/tests/test_batch_runner.py b/tests/test_batch_runner.py index bfc2eac..e01a90c 100644 --- a/tests/test_batch_runner.py +++ b/tests/test_batch_runner.py @@ -3,7 +3,7 @@ import pytest -from cadetrdm import Options, Study, Case, Environment, ProjectRepo, initialize_repo +from cadetrdm import Options, Case, Environment, ProjectRepo, initialize_repo from cadetrdm.io_utils import delete_path @@ -17,7 +17,7 @@ def test_module_import(): rdm_example = ProjectRepo( WORK_DIR / 'template', - "git@jugit.fz-juelich.de:r.jaepel/rdm_example.git", + "git@jugit.fz-juelich.de:IBG-1/ModSim/cadet/rdm_example.git", ) assert hasattr(rdm_example.module, "main") @@ -29,9 +29,9 @@ def test_run_with_non_matching_env(): WORK_DIR = Path.cwd() / "tmp" WORK_DIR.mkdir(parents=True, exist_ok=True) - rdm_example = Study( + rdm_example = ProjectRepo( WORK_DIR / 'template', - "git@jugit.fz-juelich.de:r.jaepel/rdm_example.git", + url="git@jugit.fz-juelich.de:IBG-1/ModSim/cadet/rdm_example.git", ) options = Options() @@ -67,13 +67,13 @@ def test_re_load_results(): from pathlib import Path from cadetrdm import Options - from cadetrdm import Study, Case + from cadetrdm import ProjectRepo, Case WORK_DIR = Path("batch_repos") / "studies" - batch_elution = Study( + batch_elution = ProjectRepo( WORK_DIR / 'batch_elution', - "git@jugit.fz-juelich.de:j.schmoelder/batch_elution.git", + url="git@jugit.fz-juelich.de:j.schmoelder/batch_elution.git", branch="master", ) @@ -97,9 +97,9 @@ def test_results_loading(): WORK_DIR = Path.cwd() / "tmp" WORK_DIR.mkdir(parents=True, exist_ok=True) - rdm_example = Study( + rdm_example = ProjectRepo( WORK_DIR / 'template', - "git@jugit.fz-juelich.de:r.jaepel/rdm_example.git", + url="git@jugit.fz-juelich.de:IBG-1/ModSim/cadet/rdm_example.git", ) class OptionsFixture(Options): @@ -180,7 +180,7 @@ def get_hash(self): def test_case_with_projectrepo(): class OptionsFixture(Options): def get_hash(self): - return "za16jkxf3waxxy3mkavy3jnjn5za0b" + return "4rhaw644xny9e2f75ht8rw1yambyqz7w" path_to_repo = Path("test_repo_batch") if path_to_repo.exists(): @@ -202,7 +202,7 @@ def get_hash(self): def test_results_loading_from_within(): class OptionsFixture(Options): def get_hash(self): - return "za16jkxf3waxxy3mkavy3jnjn5za0b" + return "4rhaw644xny9e2f75ht8rw1yambyqz7w" root_dir = os.getcwd() @@ -212,18 +212,31 @@ def get_hash(self): if (WORK_DIR / 'template').exists(): delete_path(WORK_DIR / 'template') - rdm_example = Study( + rdm_example = ProjectRepo( WORK_DIR / 'template', - "git@jugit.fz-juelich.de:r.jaepel/rdm_example.git", + url="git@jugit.fz-juelich.de:IBG-1/ModSim/cadet/rdm_example.git", + package_dir="template", ) try: os.chdir(WORK_DIR / 'template') - rdm_example = Study(".", "git@jugit.fz-juelich.de:r.jaepel/rdm_example.git") - - case = Case(project_repo=rdm_example, options=OptionsFixture()) - assert case.has_results_for_this_run - assert case.results_branch == '2025-02-11_17-15-38_main_d1842fd' + rdm_example = ProjectRepo( + ".", + url="git@jugit.fz-juelich.de:IBG-1/ModSim/cadet/rdm_example.git", + package_dir="template", + ) + + case = Case( + project_repo=rdm_example, + options=OptionsFixture() + ) + path = case.run_study() + + expected_path = Path('./output_cached/2026-01-14_17-55-25_main_e22b884').absolute() + assert path == expected_path + assert case.results_path == expected_path + + assert case.results_branch == '2026-01-14_17-55-25_main_e22b884' with open(rdm_example.path / "Readme.md", "a") as handle: handle.write("New line\n")