From 1be76c83e630cadaf46af717ef157cf0de057c2d Mon Sep 17 00:00:00 2001 From: Jae-Seung Yeom Date: Sat, 24 Jan 2026 21:15:28 -0800 Subject: [PATCH 1/3] Fix PyDyad link in documentation --- docs/getting_started.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 58460ba7..ea6e00f7 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -256,8 +256,8 @@ subdirectory of the install prefix. Python ******* -We offer PyDYAD, a Python binding to the DYAD client library implemented in C. -A producer-consumer example can be found at `tests/pydyad_spsc `_. +We offer `PyDYAD `_, a Python binding to the DYAD client library implemented in C. +A producer-consumer example can be found at `tests/pydyad_spsc `_. .. toctree:: :maxdepth: 1 From 70c55c13a6ca77298a272f673ef36b95b793070c Mon Sep 17 00:00:00 2001 From: Jae-Seung Yeom Date: Sat, 24 Jan 2026 22:27:25 -0800 Subject: [PATCH 2/3] Make a small update to the tutorial instruction and fix cmake module for locating flux --- cmake/modules/FindFluxCore.cmake | 3 ++- docs/demos/SCA26/batch/workflow.sh | 2 +- docs/demos/SCA26/instruction.md | 8 ++++++-- docs/demos/SCA26/setup_env.sh | 8 ++++++++ 4 files changed, 17 insertions(+), 4 deletions(-) create mode 100755 docs/demos/SCA26/setup_env.sh diff --git a/cmake/modules/FindFluxCore.cmake b/cmake/modules/FindFluxCore.cmake index d3be4262..d2ce39ed 100644 --- a/cmake/modules/FindFluxCore.cmake +++ b/cmake/modules/FindFluxCore.cmake @@ -40,9 +40,10 @@ add_library(flux::taskmap ALIAS PkgConfig::FLUX_TASKMAP) if(${FLUX} STREQUAL "FLUX-NOTFOUND") set(FluxCore_FOUND False) else() - find_path(FluxCore_INCLUDE_DIRS flux/core.h PATH_SUFFIXES include/) + find_path(FluxCore_INCLUDE_DIRS flux/core.h PATHS ${FLUX_PREFIX} PATH_SUFFIXES include/) if (NOT IS_DIRECTORY "${FluxCore_INCLUDE_DIRS}") set(FluxCore_FOUND FALSE) + message(STATUS "Cannot find FluxCore include directory") else() message("-- 
FluxCore_INCLUDE_DIRS: " ${FluxCore_INCLUDE_DIRS}) get_filename_component(FluxCore_ROOT_DIR ${FluxCore_INCLUDE_DIRS}/.. ABSOLUTE) diff --git a/docs/demos/SCA26/batch/workflow.sh b/docs/demos/SCA26/batch/workflow.sh index b13b543c..008a2030 100755 --- a/docs/demos/SCA26/batch/workflow.sh +++ b/docs/demos/SCA26/batch/workflow.sh @@ -21,7 +21,7 @@ fi source ${script_dir}/select_language.sh -if [ "${DYAD_PATH_CONSUMER}" == "" || "${DYAD_PATH_PRODUCER}" == "" ] ; then +if [ "${DYAD_PATH_CONSUMER}" == "" ] || [ "${DYAD_PATH_PRODUCER}" == "" ] ; then echo Undefined environment variables: DYAD_PATH_PRODUCER and DYAD_PATH_CONSUMER exit 1 fi diff --git a/docs/demos/SCA26/instruction.md b/docs/demos/SCA26/instruction.md index f0df64a8..53074b8b 100644 --- a/docs/demos/SCA26/instruction.md +++ b/docs/demos/SCA26/instruction.md @@ -1,5 +1,6 @@ # DYAD Tutorial at SCA/HPCAsia 2026 in Osaka Japan This tutorial is offered as a part of the "Accelerating HPC Application I/O with Fast Node-Local Storage" session. +The material of this tutorial is under [docs/demos/SCA26](https://github.com/flux-framework/dyad/tree/main/docs/demos/SCA26). ## DYAD dependencies - requires: [flux-core](https://github.com/flux-framework/flux-core.git), [jansson](https://github.com/akheron/jansson.git) @@ -25,11 +26,9 @@ spack load mochi-margo # spack env deactivate ``` -pip install flux-python==0.80.0 ## Setup the environment ``` -export DYAD_INSTALL_PREFIX=/home/${USER}/venv module load flux-core mochi-margo # Or # spack env activate dyad @@ -39,6 +38,7 @@ module load flux-core mochi-margo ## Build DYAD ``` +export DYAD_INSTALL_PREFIX=/home/${USER}/venv git clone https://github.com/flux-framework/dyad.git cd dyad; mkdir build; cd build cmake -DDYAD_ENABLE_MARGO_DATA=ON \ @@ -185,9 +185,13 @@ $ squeue 70 compute interact yeom2 R 1:38 2 dsaicn[01-02] ``` +Before continuing, verify that the second terminal is using the same environment +as the first terminal. 
A script, `setup_env.sh`, is provided to quickly set this +up by running `source ./setup_env.sh`. Then, set up a Flux proxy to connect to the existing instance under that allocation using the commands, [flux uri](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man1/flux-uri.html) and [flux proxy](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man1/flux-proxy.html). + ``` $ flux proxy `flux uri slurm:70` $ flux exec -r all hostname diff --git a/docs/demos/SCA26/setup_env.sh b/docs/demos/SCA26/setup_env.sh new file mode 100755 index 00000000..1b36e562 --- /dev/null +++ b/docs/demos/SCA26/setup_env.sh @@ -0,0 +1,8 @@ +module load flux-core mochi-margo + +export DYAD_INSTALL_PREFIX=/home/${USER}/venv +export DYAD_KVS_NAMESPACE=dyad +export DYAD_DTL_MODE=MARGO +export DYAD_PATH_PRODUCER=/mnt/ssd/${USER}/dyad +export DYAD_PATH_CONSUMER=/mnt/ssd/${USER}/dyad +source ${DYAD_INSTALL_PREFIX}/bin/activate From 5db61ecc179addc318a6a679182db847daffbd43 Mon Sep 17 00:00:00 2001 From: Jae-Seung Yeom Date: Sun, 25 Jan 2026 00:34:54 -0800 Subject: [PATCH 3/3] Separate DLIO data dir from result dir in the tutorial --- docs/demos/SCA26/instruction.md | 39 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/docs/demos/SCA26/instruction.md b/docs/demos/SCA26/instruction.md index 53074b8b..84384ee3 100644 --- a/docs/demos/SCA26/instruction.md +++ b/docs/demos/SCA26/instruction.md @@ -331,7 +331,7 @@ improves I/O performance. In this setup, each trainer acts as both a consumer and a producer. DYAD first checks whether a sample exists in the cache; if not, the sample is loaded from shared storage. -In this example, `${HOME}/demo_DLIO` is defined as the location where training +In this example, `${RESULT_DIR}` is defined as the location where training data and benchmark results are stored. 
@@ -363,12 +363,19 @@ export DYAD_KVS_NAMESPACE=dyad export DYAD_DTL_MODE=MARGO export DYAD_PATH_PRODUCER=/mnt/ssd/${USER}/dyad export DYAD_PATH_CONSUMER=/mnt/ssd/${USER}/dyad -mkdir -p ${HOME}/demo_DLIO -export PYTHONPATH=${HOME}/demo_DLIO:$PYTHONPATH +mkdir -p ${RESULT_DIR} +export PYTHONPATH=${RESULT_DIR}:$PYTHONPATH export TEST_FILE_SIZE=33554432 export TEST_NUM_FILES=6400 ``` -Copy `dyad/docs/demos/SCA26/DL/dyad_torch_data_loader.py` into `${HOME}/demo_DLIO` +Copy `dyad/docs/demos/SCA26/DL/dyad_torch_data_loader.py` into `${RESULT_DIR}`. +We will use pre-generated data under a shared directory, `/home/sca26-tut/share`, +to save time in this tutorial. + +``` +export DATA_DIR=/home/sca26-tut/share/dataset +export RESULT_DIR=${HOME}/demo_DLIO +``` Alllocate compute nodes and start a Flux instance @@ -396,13 +403,13 @@ flux run -N 4 -n 128 dlio_benchmark \ workload=unet3d_a100 \ ++workload.workflow.generate_data=True \ ++workload.workflow.train=False \ - hydra.run.dir=${HOME}/demo_DLIO/output \ - ++workload.output.folder=${HOME}/demo_DLIO/output \ + hydra.run.dir=${RESULT_DIR}/output \ + ++workload.output.folder=${RESULT_DIR}/output \ ++workload.dataset.num_files_train=${TEST_NUM_FILES} \ ++workload.dataset.record_length_bytes=${TEST_FILE_SIZE} \ ++workload.dataset.record_length_bytes_stdev=0 \ - ++workload.dataset.data_folder=${HOME}/demo_DLIO/dataset \ - ++workload.checkpoint.checkpoint_folder=${HOME}/demo_DLIO/checkpoint + ++workload.dataset.data_folder=${DATA_DIR} \ + ++workload.checkpoint.checkpoint_folder=${RESULT_DIR}/checkpoint ``` ### Train without DYAD @@ -412,13 +419,13 @@ flux run -N 4 -n 128 dlio_benchmark \ workload=unet3d_a100 \ ++workload.workflow.generate_data=False \ ++workload.workflow.train=True \ - hydra.run.dir=${HOME}/demo_DLIO/output \ - ++workload.output.folder=${HOME}/demo_DLIO/output \ + hydra.run.dir=${RESULT_DIR}/output \ + ++workload.output.folder=${RESULT_DIR}/output \ ++workload.dataset.num_files_train=${TEST_NUM_FILES} \ 
++workload.dataset.record_length_bytes=${TEST_FILE_SIZE} \ ++workload.dataset.record_length_bytes_stdev=0 \ - ++workload.dataset.data_folder=${HOME}/demo_DLIO/dataset \ - ++workload.checkpoint.checkpoint_folder=${HOME}/demo_DLIO/checkpoint \ + ++workload.dataset.data_folder=${DATA_DIR} \ + ++workload.checkpoint.checkpoint_folder=${RESULT_DIR}/checkpoint \ ++workload.reader.batch_size=1 \ ++workload.train.epochs=20 \ ++workload.train.computation_time=0 @@ -431,13 +438,13 @@ flux run -N 4 -n 128 dlio_benchmark \ workload=unet3d_a100 \ ++workload.workflow.generate_data=False \ ++workload.workflow.train=True \ - hydra.run.dir=${HOME}/demo_DLIO/output \ - ++workload.output.folder=${HOME}/demo_DLIO/output \ + hydra.run.dir=${RESULT_DIR}/output \ + ++workload.output.folder=${RESULT_DIR}/output \ ++workload.dataset.num_files_train=${TEST_NUM_FILES} \ ++workload.dataset.record_length_bytes=${TEST_FILE_SIZE} \ ++workload.dataset.record_length_bytes_stdev=0 \ - ++workload.dataset.data_folder=${HOME}/demo_DLIO/dataset \ - ++workload.checkpoint.checkpoint_folder=${HOME}/demo_DLIO/checkpoint \ + ++workload.dataset.data_folder=${DATA_DIR} \ + ++workload.checkpoint.checkpoint_folder=${RESULT_DIR}/checkpoint \ ++workload.reader.batch_size=1 \ ++workload.train.epochs=20 \ ++workload.train.computation_time=0 \