-
Notifications
You must be signed in to change notification settings - Fork 66
FXC-3004-container-aware-web-run-returning-lazy-results #2880
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FXC-3004-container-aware-web-run-returning-lazy-results #2880
Conversation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
10 files reviewed, 1 comment
c9a5088
to
5217204
Compare
5217204
to
145debb
Compare
Diff CoverageDiff: origin/develop...HEAD, staged and unstaged changes
Summary
tidy3d/web/api/container.pyLines 313-323 313 @property
314 def status(self):
315 """Return current status of :class:`Job`."""
316 if web._is_modeler_batch(self.task_id):
! 317 detail = self.get_info()
! 318 status = detail.totalStatus.value
! 319 return status
320 else:
321 return self.get_info().status
322
323 @property Lines 322-338 322
323 @property
324 def postprocess_status(self):
325 """Return current postprocess status of :class:`Job` if it is a Component Modeler."""
! 326 if web._is_modeler_batch(self.task_id):
! 327 detail = self.get_info()
! 328 return detail.postprocessStatus
329 else:
! 330 log.warning(
331 f"Task ID '{self.task_id}' is not a modeler batch job. "
332 "'postprocess_start' is only applicable to Component Modelers"
333 )
! 334 return
335
336 def start(self, priority: Optional[int] = None) -> None:
337 """Start running a :class:`Job`. Lines 461-480 461 verbose : bool = True
462 Whether to print info messages. This overrides the Job's 'verbose' setting for this call.
463 """
464 # First, confirm that the task is a modeler batch job.
! 465 if not web._is_modeler_batch(self.task_id):
466 # If not, inform the user and exit.
467 # This warning is important and should not be suppressed.
! 468 log.warning(
469 f"Task ID '{self.task_id}' is not a modeler batch job. "
470 "'postprocess_start' is only applicable to Component Modelers"
471 )
! 472 return
473
474 # If it is a modeler batch, call the dedicated function to start postprocessing.
475 # The verbosity is a combination of the job's setting and the method's parameter.
! 476 web.postprocess_start(
477 batch_id=self.task_id, verbose=(self.verbose and verbose), worker_group=worker_group
478 )
479
480 @staticmethod Lines 910-923 910 The specific worker group to run the postprocessing tasks on.
911 verbose : bool = True
912 Whether to print info messages.
913 """
! 914 if self.verbose and verbose:
! 915 console = get_logging_console()
! 916 console.log("Attempting to start postprocessing for jobs in the batch.")
917
! 918 for job in self.jobs.values():
! 919 job.postprocess_start(worker_group=worker_group, verbose=verbose)
920
921 def monitor(self) -> None:
922 """
923 Monitor progress of each of the running tasks. Lines 936-952 936 # For regular jobs, finish when the status is in an end state.
937 return status not in END_STATES
938
939 # For modeler jobs, the logic is more complex.
! 940 if status == "run_success":
! 941 condition = job.postprocess_status not in END_STATES
! 942 return condition
943
! 944 if status not in END_STATES:
! 945 return True # Still running, definitely continue.
946
947 # If status is a final end state (e.g., 'success', 'error'), we are done.
! 948 return False
949
950 def pbar_description(
951 task_name: str, status: str, max_name_length: int, status_width: int
952 ) -> str: Lines 961-971 961 if status in ERROR_STATES:
962 status_part = f"→ [red]{status:<{status_width}}"
963 elif status in COMPLETED_STATES:
964 status_part = f"→ [green]{status:<{status_width}}"
! 965 elif status in (PRE_ERROR_STATES | PRE_VALIDATE_STATES):
966 status_part = f"→ [yellow]{status:<{status_width}}"
! 967 elif status in RUNNING_STATES:
968 status_part = f"→ [blue]{status:<{status_width}}"
969 else:
970 status_part = f"→ {status:<{status_width}}" Lines 1007-1036 1007
1008 while any(check_continue_condition(job) for job in self.jobs.values()):
1009 for task_name, job in self.jobs.items():
1010 status = job.status
! 1011 if (
1012 web._is_modeler_batch(job.task_id)
1013 and status == "run_success"
1014 and job.task_id not in postprocess_started_tasks
1015 ):
! 1016 job.postprocess_start(verbose=True)
! 1017 postprocess_started_tasks.add(job.task_id)
1018
1019 # Update progress bar
! 1020 pbar = pbar_tasks[task_name]
! 1021 if status != "run_success":
! 1022 completed_percent = STATE_PROGRESS_PERCENTAGE[status]
! 1023 elif status == "run_success":
! 1024 postprocess_status = job.postprocess_status
! 1025 if postprocess_status in END_STATES:
! 1026 status = postprocess_status
! 1027 completed_percent = STATE_PROGRESS_PERCENTAGE[postprocess_status]
1028 else:
! 1029 status = "postprocess"
! 1030 completed_percent = STATE_PROGRESS_PERCENTAGE["postprocess"]
! 1031 description = pbar_description(task_name, status, max_name_length, 0)
! 1032 progress.update(pbar, description=description, completed=completed_percent)
1033
1034 progress.refresh()
1035 time.sleep(BATCH_MONITOR_PROGRESS_REFRESH_TIME) Lines 1038-1053 1038 for task_name, job in self.jobs.items():
1039 status = job.status
1040 if status != "run_success":
1041 completed_percent = STATE_PROGRESS_PERCENTAGE[status]
! 1042 elif status == "run_success":
! 1043 postprocess_status = job.postprocess_status
! 1044 if postprocess_status in END_STATES:
! 1045 status = postprocess_status
! 1046 completed_percent = STATE_PROGRESS_PERCENTAGE[postprocess_status]
1047 else:
! 1048 status = "postprocess"
! 1049 completed_percent = STATE_PROGRESS_PERCENTAGE["postprocess"]
1050 pbar = pbar_tasks[task_name]
1051 description = pbar_description(task_name, status, max_name_length, 0)
1052
1053 progress.update(pbar, description=description, completed=completed_percent) Lines 1056-1073 1056 console.log("Batch complete.")
1057
1058 else:
1059 # Non-verbose path
! 1060 while any(check_continue_condition(job) for job in self.jobs.values()):
! 1061 for job in self.jobs.values():
1062 # If a modeler job is done running, start its postprocessing once.
! 1063 if (
1064 web._is_modeler_batch(job.task_id)
1065 and job.status == "run_success"
1066 and job.task_id not in postprocess_started_tasks
1067 ):
! 1068 job.postprocess_start(verbose=False)
! 1069 postprocess_started_tasks.add(job.task_id)
1070 time.sleep(web.REFRESH_TIME)
1071
1072 @staticmethod
1073 def _job_data_path(task_id: TaskId, path_dir: str = DEFAULT_DATA_DIR): tidy3d/web/api/run.pyLines 42-54 42 _collect_by_hash(v, found)
43 return found
44 if isinstance(node, dict):
45 if any(isinstance(k, WorkflowType) for k in node.keys()):
! 46 raise ValueError("Dict keys must not be simulations.")
47 for v in node.values():
48 _collect_by_hash(v, found)
49 return found
! 50 raise TypeError(f"Unsupported element in container: {type(node)!r}")
51
52
53 def _reconstruct_by_hash(node: RunInput, h2data: dict[str, WorkflowDataType]) -> RunOutput:
54 """Replaces each leaf node (simulation) with its corresponding data object. Lines 72-80 72 if isinstance(n, list):
73 return [_recur(v) for v in n]
74 if isinstance(n, dict):
75 return {k: _recur(v) for k, v in n.items()}
! 76 raise TypeError(f"Unsupported element in reconstruction: {type(n)!r}")
77
78 return _recur(node)
79 Lines 217-225 217 Underlying autograd batch submission implementation.
218 """
219 h2sim: dict[str, WorkflowType] = _collect_by_hash(simulation)
220 if not h2sim:
! 221 raise ValueError("No simulation data found in simulation input.")
222
223 key_prefix = ""
224 if len(h2sim) == 1:
225 path = path if path is not None else DEFAULT_DATA_PATH tidy3d/web/api/webapi.pyLines 495-504 495 ValueError
496 If no task is found for the given ``task_id``.
497 """
498 if _is_modeler_batch(task_id):
! 499 batch = BatchTask(task_id)
! 500 return batch.detail(batch_type="RF_SWEEP")
501 else:
502 task = SimulationTask.get(task_id, verbose)
503 if not task:
504 raise ValueError("Task not found.") Lines 545-555 545 batch = BatchTask(task_id)
546 detail = batch.wait_for_validate(batch_type="RF_SWEEP")
547 status = detail.totalStatus
548 status_str = status.value
! 549 if status_str in POST_VALIDATE_STATES:
! 550 pass
! 551 elif status_str not in POST_VALIDATE_STATES:
552 raise WebError(f"Batch task {task_id} is blocked: {status_str}")
553 # Submit batch to start runs after validation
554 batch.submit(
555 solver_version=solver_version, batch_type="RF_SWEEP", worker_group=worker_group Lines 553-562 553 # Submit batch to start runs after validation
554 batch.submit(
555 solver_version=solver_version, batch_type="RF_SWEEP", worker_group=worker_group
556 )
! 557 if verbose:
! 558 console.log(f"Component Modeler '{task_id}' validation succeeded. Starting to solve...")
559 return
560
561 if priority is not None and (priority < 1 or priority > 10):
562 raise ValueError("Priority must be between '1' and '10' if specified.") Lines 600-642 600 ----------
601 task_id : str
602 Unique identifier of task on server. Returned by :meth:`upload`.
603 """
! 604 if _is_modeler_batch(task_id):
605 # split (modeler-specific)
! 606 batch = BatchTask(task_id)
! 607 detail = batch.detail(batch_type="RF_SWEEP")
! 608 status = detail.totalStatus
! 609 if status == "visualize":
! 610 return "success"
! 611 if status in ERROR_STATES:
! 612 try:
613 # TODO Try to obtain the error message
! 614 pass
! 615 except Exception:
616 # If the error message could not be obtained, raise a generic error message
! 617 error_msg = "Error message could not be obtained, please contact customer support."
618
! 619 raise WebError(f"Error running task {task_id}! {error_msg}")
620 else:
! 621 task_info = get_info(task_id)
! 622 status = task_info.status
! 623 if status == "visualize":
! 624 return "success"
! 625 if status in ERROR_STATES:
! 626 try:
627 # Try to obtain the error message
! 628 task = SimulationTask(taskId=task_id)
! 629 with tempfile.NamedTemporaryFile(suffix=".json") as tmp_file:
! 630 task.get_error_json(to_file=tmp_file.name)
! 631 with open(tmp_file.name) as f:
! 632 error_content = json.load(f)
! 633 error_msg = error_content["msg"]
! 634 except Exception:
635 # If the error message could not be obtained, raise a generic error message
! 636 error_msg = "Error message could not be obtained, please contact customer support."
637
! 638 raise WebError(f"Error running task {task_id}! {error_msg}")
639 return status
640
641
642 def monitor(task_id: TaskId, verbose: bool = True, worker_group: Optional[str] = None) -> None: Lines 806-814 806 if task_type in GUI_SUPPORTED_TASK_TYPES:
807 url = _get_url(task_id)
808 console.log(f"View simulation result at [blue underline][link={url}]'{url}'[/link].")
809 else:
! 810 while get_status(task_id) not in END_STATES:
811 time.sleep(REFRESH_TIME)
812
813
814 @wait_for_connection Lines 900-908 900 total = resp.totalTask or 0
901 post_succ = resp.postprocessSuccess or 0
902 status = resp.totalStatus
903 status_str = status.value
! 904 if status_str in ERROR_STATES:
905 raise WebError(
906 f"Batch task {task_id} failed during postprocess: {status_str}"
907 ) from None
908 if total > 0 and post_succ >= total: Lines 1057-1068 1057 """
1058 # For component modeler batches, default to a clearer filename if the default was used.
1059 if _is_modeler_batch(task_id):
1060 base_dir = os.path.dirname(path) or "."
! 1061 if os.path.basename(path) == "simulation_data.hdf5":
! 1062 path = os.path.join(base_dir, "cm_data.hdf5")
! 1063 elif os.path.basename(path) == "simulation_data.hdf5.gz":
! 1064 path = os.path.join(base_dir, "cm_data.hdf5.gz")
1065
1066 if not os.path.exists(path) or replace_existing:
1067 download(task_id=task_id, path=path, verbose=verbose, progress_callback=progress_callback) Lines 1125-1140 1125 d = _batch_detail(batch_id)
1126 s = d.totalStatus.value
1127 total = d.totalTask or 0
1128 r = d.runSuccess or 0
! 1129 if s in ERROR_STATES:
1130 raise WebError(f"Batch {batch_id} terminated: {s}")
1131 # Updated break condition for robustness
! 1132 if s in ("run_success", "success") or (total and r >= total):
1133 break
1134 time.sleep(REFRESH_TIME)
1135
! 1136 postprocess_start(batch_id, verbose=False, worker_group=worker_group)
1137
1138 while True:
1139 d = _batch_detail(batch_id)
1140 postprocess_status = d.postprocessStatus Lines 1139-1147 1139 d = _batch_detail(batch_id)
1140 postprocess_status = d.postprocessStatus
1141 if postprocess_status == "success":
1142 break
! 1143 elif postprocess_status in ERROR_STATES:
1144 raise WebError(
1145 f"Batch {batch_id} terminated. Please contact customer support and provide this Component Modeler batch ID: '{batch_id}'"
1146 )
1147 time.sleep(REFRESH_TIME) Lines 1201-1209 1201 continue
1202 tstatus = (t.status or "draft").lower()
1203 _, idx = _status_to_stage(tstatus)
1204 desc = f" {tname} [{tstatus or 'draft'}]"
! 1205 progress.update(
1206 pbar,
1207 completed=STATE_PROGRESS_PERCENTAGE[tstatus] / 100,
1208 description=desc,
1209 refresh=False, Lines 1209-1224 1209 refresh=False,
1210 )
1211
1212 # Updated break condition for robustness
! 1213 if status in ("run_success", "success") or (total and r >= total):
1214 break
! 1215 if status in ERROR_STATES:
1216 raise WebError(f"Batch {batch_id} terminated: {status}")
1217 progress.refresh()
1218 time.sleep(REFRESH_TIME)
1219
! 1220 postprocess_start(batch_id, verbose=True, worker_group=worker_group)
1221
1222 p_post = progress.add_task("Postprocess", total=1.0)
1223 while True:
1224 detail = _batch_detail(batch_id) Lines 1232-1240 1232 elif postprocess_status == "preprocess":
1233 progress.update(p_post, completed=0.33)
1234 elif postprocess_status == "running":
1235 progress.update(p_post, completed=0.55)
! 1236 elif postprocess_status in ERROR_STATES:
1237 raise WebError(
1238 f"Batch {batch_id} terminated. Please contact customer support and provide this Component Modeler batch ID: '{batch_id}'"
1239 )
1240 progress.refresh() Lines 1395-1422 1395 if _is_modeler_batch(task_id):
1396 d = _batch_detail(task_id)
1397 status = d.totalStatus.value
1398
! 1399 if status in ALL_POST_VALIDATE_STATES:
! 1400 est_flex_unit = _batch_detail(task_id).estFlexUnit
! 1401 if verbose:
! 1402 console.log(
1403 f"Maximum FlexCredit cost: {est_flex_unit:1.3f}. Minimum cost depends on "
1404 "task execution details. Use 'web.real_cost(task_id)' to get the billed FlexCredit "
1405 "cost after a simulation run."
1406 )
! 1407 return est_flex_unit
1408
! 1409 elif status in ERROR_STATES:
! 1410 log.error(f"The ComponentModeler '{task_id}' has failed: {status}")
1411
! 1412 if status == "validate_fail":
! 1413 assert d.validateErrors is not None
! 1414 for key, error in d.validateErrors.items():
1415 # I don't like this ideally but would like to control the endpoint to make this better
! 1416 error_dict = json.loads(error)
! 1417 validation_error = error_dict["validation_error"]
! 1418 log.error(
1419 f"Subtask '{key}' has failed to validate:"
1420 f" \n {validation_error} \n "
1421 f"Fix your component modeler configuration. "
1422 f"Generate subtask simulations locally using `ComponentModelerType.sim_dict`." Lines 1421-1429 1421 f"Fix your component modeler configuration. "
1422 f"Generate subtask simulations locally using `ComponentModelerType.sim_dict`."
1423 )
1424
! 1425 raise WebError("Could not get estimated cost!")
1426
1427 else:
1428 task = SimulationTask.get(task_id) Lines 1456-1465 1456 f" {fc_post:1.3f} FlexCredit of the total cost from post-processing."
1457 )
1458 return task_info.estFlexUnit
1459
! 1460 elif status in ERROR_STATES:
! 1461 log.error(f"The task '{task_id}' has failed: {status}")
1462
1463 # Something went wrong
1464 raise WebError("Could not get estimated cost!") Lines 1622-1656 1622 Checks if a batch run is complete and starts the postprocess phase.
1623
1624 This function does not wait for postprocessing to finish.
1625 """
! 1626 console = get_logging_console() if verbose else None
! 1627 if _is_modeler_batch(batch_id):
1628 # Perform a single check on the run phase status
! 1629 detail = _batch_detail(batch_id)
! 1630 status = detail.totalStatus.value
! 1631 total_tasks = detail.totalTask or 0
! 1632 successful_runs = detail.runSuccess or 0
1633
! 1634 if status in ERROR_STATES:
! 1635 raise WebError(f"Batch '{batch_id}' terminated with error status: {status}")
1636
1637 # Check if the run phase is complete before proceeding
! 1638 is_run_complete = status in ("run_success", "success") or (
1639 total_tasks > 0 and successful_runs >= total_tasks
1640 )
1641
! 1642 if not is_run_complete:
! 1643 if console:
! 1644 console.log(
1645 f"Batch '{batch_id}' run phase is not yet complete (Status: {status}). "
1646 f"Cannot start postprocessing."
1647 )
! 1648 return # Exit if the run is not done
! 1649 BatchTask(batch_id).postprocess(batch_type="RF_SWEEP", worker_group=worker_group)
! 1650 return
1651 else:
! 1652 raise WebError(
1653 f"Batch ID '{batch_id}' is not a component modeler batch job. "
1654 "'postprocess_start' is only applicable to those classes."
1655 ) tidy3d/web/core/file_util.pyLines 41-53 41 def read_simulation_from_hdf5_gz(file_name: str) -> str:
42 """read simulation str from hdf5.gz"""
43
44 hdf5_file, hdf5_file_path = tempfile.mkstemp(".hdf5")
! 45 os.close(hdf5_file)
46 try:
47 extract_gzip_file(file_name, hdf5_file_path)
48 # Pass the uncompressed temporary file path to the reader
! 49 json_str = read_simulation_from_hdf5(hdf5_file_path)
50 finally:
51 os.unlink(hdf5_file_path)
52 return json_str |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @marcorudolphflex this is great! I left a couple of comments, nothing too major.
145debb
to
e03175a
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Additional Comments (1)
-
tidy3d/plugins/smatrix/run.py
, line 160-181 (link)syntax: Docstring describes the old implementation and mentions returning
ComponentModelerDataType
but the function now returnstuple[SimulationMap, dict[str, Any]]
Context Used: Rule from
dashboard
- Keep docstrings and comments synchronized with code changes to ensure they are always accurate and n... (source)
12 files reviewed, 3 comments
853a2b0
to
337db93
Compare
19584dd
to
f2343ac
Compare
f2343ac
to
bf28c60
Compare
2d62641
to
f3d8a21
Compare
Co-authored-by: daquinteroflex <dario@flexcompute.com>
f3d8a21
to
d849b93
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Awesome, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
12 files reviewed, no comments
Goal
Extend web.run (and only the surfaces that already wrap it - autograd entry points, CLI helpers, Job/Batch, run_async) so the API accepts arbitrary Python containers and returns SimulationData lazily in the same structure, relying on the lazy-load.
Implementation
Open questions:
If same simulation is submitted multiple times, the sim_data objects of these simulations are the same - should we make copies?
Greptile Overview
Updated On: 2025-10-16 19:47:25 UTC
Greptile Summary
This PR implements a major enhancement to the
web.run()
API, making it container-aware so it can accept arbitrary Python containers (lists, tuples, dictionaries) containing simulation objects and return results in the same nested structure. The implementation introduces a unified interface that maintains backward compatibility while adding powerful batch processing capabilities.The core change is a new
tidy3d/web/api/run.py
module that serves as a container-aware wrapper. This wrapper intelligently delegates to the existing legacyrun
function for single simulations and torun_async
for multiple simulations. The implementation uses hash-based deduplication to avoid running identical simulations multiple times, optimizing performance when the same simulation appears in different parts of the input structure.A new
lazy
parameter has been added throughout the web API stack, allowing users to control whether simulation data is loaded immediately (eager) or deferred until first access (lazy). The lazy loading behavior defaults toFalse
for single simulations (maintaining existing behavior) andTrue
for batch runs (more efficient for large operations).The changes extend across multiple layers including the main web interface, autograd entry points, container classes (Job, Batch, BatchData), and asynchronous operations. Path parameter interpretation has also been enhanced to treat paths as directories for multiple runs and filenames for single runs, providing intuitive behavior based on input type.
Important Files Changed
Changed Files
tidy3d/web/api/run.py
tidy3d/web/api/container.py
tests/test_web/test_webapi.py
tidy3d/web/api/webapi.py
tidy3d/web/api/asynchronous.py
tidy3d/web/api/autograd/autograd.py
tidy3d/web/api/autograd/engine.py
tidy3d/web/__init__.py
tests/test_web/test_webapi_mode.py
CHANGELOG.md
tidy3d/plugins/smatrix/run.py
tidy3d/packaging.py
Confidence score: 3/5
tidy3d/web/api/run.py
which contains debugging print statements that violate coding standardsSequence Diagram
Context used:
dashboard
- Remove temporary debugging code (print() calls), commented-out code, and other workarounds before fi... (source)