From f1d9e21b05ecbccc426863291ff42c17e69fba2e Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 8 Apr 2024 13:36:24 -0700 Subject: [PATCH 01/15] dockerfile --- habana/Dockerfile.habana1130_pytorch210 | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 habana/Dockerfile.habana1130_pytorch210 diff --git a/habana/Dockerfile.habana1130_pytorch210 b/habana/Dockerfile.habana1130_pytorch210 new file mode 100644 index 000000000..5c0319d0e --- /dev/null +++ b/habana/Dockerfile.habana1130_pytorch210 @@ -0,0 +1,24 @@ +FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.0:latest +LABEL maintainer="Javier Duarte " + +RUN pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.1.0+cpu.html +RUN pip install transformers datasets +RUN pip install onnx onnxruntime onnxscript +RUN pip install torch_runstats +RUN pip install scikit-image +RUN pip install absl-py +RUN pip install sporco +RUN pip install Ninja +RUN pip install ml-collections +RUN pip install keras-core +RUN pip pip install keras-cv +RUN pip pip install tensorflow-datasets +RUN pip pip install packaging ninja +# RUN MAX_JOBS=4 pip install flash-attn --no-build-isolation +RUN pip install triton +RUN pip install hls4ml[profiling] +RUN pip install open3d-cpu +RUN pip install ray[default] ray[train] ray[tune] +RUN pip install causal-conv1d +RUN pip install mamba-ssm +RUN pip install comet_ml From 553304ed248bc8d475f4ed235d029210813609f6 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 8 Apr 2024 14:02:31 -0700 Subject: [PATCH 02/15] fix --- habana/Dockerfile.habana1130_pytorch210 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/habana/Dockerfile.habana1130_pytorch210 b/habana/Dockerfile.habana1130_pytorch210 index 5c0319d0e..45ea15984 100644 --- a/habana/Dockerfile.habana1130_pytorch210 +++ b/habana/Dockerfile.habana1130_pytorch210 @@ -11,9 +11,9 @@ RUN pip install sporco RUN pip install Ninja RUN pip install ml-collections RUN pip install keras-core -RUN pip pip install keras-cv -RUN pip pip install tensorflow-datasets -RUN pip pip install packaging ninja +RUN pip install keras-cv +RUN pip install tensorflow-datasets +RUN pip install packaging ninja # RUN MAX_JOBS=4 pip install flash-attn --no-build-isolation RUN pip install triton RUN pip install hls4ml[profiling] From c668010494b416b6d8716aad184e1849d44b5d69 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 8 Apr 2024 18:11:44 -0700 Subject: [PATCH 03/15] update --- habana/Dockerfile.habana1130_pytorch210 | 11 +++-- habana/requirements_base.txt | 62 +++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 habana/requirements_base.txt diff --git a/habana/Dockerfile.habana1130_pytorch210 b/habana/Dockerfile.habana1130_pytorch210 index 45ea15984..63263c4c2 100644 --- a/habana/Dockerfile.habana1130_pytorch210 +++ b/habana/Dockerfile.habana1130_pytorch210 @@ -1,7 +1,12 @@ FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.0:latest LABEL maintainer="Javier Duarte " -RUN pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.1.0+cpu.html +RUN apt-get update && \ + apt-get install -qq -y graphviz graphviz-dev + +COPY requirements_base.txt . 
+RUN pip install -r requirements_base.txt +RUN pip install --verbose torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.1.0+cpu.html RUN pip install transformers datasets RUN pip install onnx onnxruntime onnxscript RUN pip install torch_runstats @@ -19,6 +24,6 @@ RUN pip install triton RUN pip install hls4ml[profiling] RUN pip install open3d-cpu RUN pip install ray[default] ray[train] ray[tune] -RUN pip install causal-conv1d -RUN pip install mamba-ssm +# RUN pip install causal-conv1d +# RUN pip install mamba-ssm RUN pip install comet_ml diff --git a/habana/requirements_base.txt b/habana/requirements_base.txt new file mode 100644 index 000000000..d9f967ed0 --- /dev/null +++ b/habana/requirements_base.txt @@ -0,0 +1,62 @@ +POT +PyYAML +astropy +awkward +awkward0 +black +bokeh +boost-histogram +corner +dask +distributed +docopt +emcee +energyflow +fastjet +fastparquet +flake8 +girder-client +hdbscan +healpy +hydra-core +imageio +imageio-ffmpeg +ipyparallel +isort +jupyter +jupyterlab +kaleido +line_profiler +lmfit +lz4 +matplotlib +memory_profiler +mpl_scatter_density +mplhep +networkx +notebook +numba +numpy +pandas +papermill +parsl +particle +plotly +pre-commit +pyarrow +pydot +pygraphviz +pyhf +pymultinest +pynbody +pytest +scikit-learn +scipy +seaborn +tables +tensorboard +tqdm +uproot +vector +xxhash +zenodo_get From 4dbfbbfb46b97df2671369690436c9a505282fd5 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 11 Nov 2024 21:18:25 -0800 Subject: [PATCH 04/15] update docker --- habana/Dockerfile.habana1130_pytorch210 | 23 +--------- habana/requirements_base.txt | 57 +++++++------------------ 2 files changed, 17 insertions(+), 63 deletions(-) diff --git a/habana/Dockerfile.habana1130_pytorch210 b/habana/Dockerfile.habana1130_pytorch210 index 63263c4c2..e248bf505 100644 --- a/habana/Dockerfile.habana1130_pytorch210 +++ b/habana/Dockerfile.habana1130_pytorch210 @@ -3,27 +3,6 @@ LABEL maintainer="Javier Duarte " RUN apt-get update && \ apt-get install -qq -y graphviz graphviz-dev - + COPY requirements_base.txt . 
RUN pip install -r requirements_base.txt -RUN pip install --verbose torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.1.0+cpu.html -RUN pip install transformers datasets -RUN pip install onnx onnxruntime onnxscript -RUN pip install torch_runstats -RUN pip install scikit-image -RUN pip install absl-py -RUN pip install sporco -RUN pip install Ninja -RUN pip install ml-collections -RUN pip install keras-core -RUN pip install keras-cv -RUN pip install tensorflow-datasets -RUN pip install packaging ninja -# RUN MAX_JOBS=4 pip install flash-attn --no-build-isolation -RUN pip install triton -RUN pip install hls4ml[profiling] -RUN pip install open3d-cpu -RUN pip install ray[default] ray[train] ray[tune] -# RUN pip install causal-conv1d -# RUN pip install mamba-ssm -RUN pip install comet_ml diff --git a/habana/requirements_base.txt b/habana/requirements_base.txt index d9f967ed0..febab9e79 100644 --- a/habana/requirements_base.txt +++ b/habana/requirements_base.txt @@ -1,62 +1,37 @@ -POT -PyYAML -astropy +array-record +autopep8 awkward -awkward0 -black -bokeh -boost-histogram -corner -dask -distributed -docopt -emcee -energyflow +boost_histogram +click +comet-ml fastjet -fastparquet -flake8 -girder-client -hdbscan -healpy -hydra-core -imageio -imageio-ffmpeg -ipyparallel -isort +fsspec jupyter -jupyterlab -kaleido -line_profiler -lmfit -lz4 +jupyter-book matplotlib -memory_profiler -mpl_scatter_density +mlcroissant mplhep networkx +nevergrad notebook numba numpy +onnx +onnxruntime pandas papermill -parsl -particle plotly pre-commit +protobuf pyarrow -pydot -pygraphviz -pyhf -pymultinest -pynbody -pytest +ray[train,tune] scikit-learn +scikit-optimize scipy seaborn -tables -tensorboard +setGPU +tensorflow-datasets tqdm uproot vector -xxhash zenodo_get From ffd0674de81c209291b2a8c406ad51b4435c5297 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 16 Dec 2024 18:30:25 -0800 Subject: [PATCH 05/15] update --- ...e.habana1130_pytorch210 => Dockerfile.habana1151_pytorch212} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename habana/{Dockerfile.habana1130_pytorch210 => Dockerfile.habana1151_pytorch212} (67%) diff --git a/habana/Dockerfile.habana1130_pytorch210 b/habana/Dockerfile.habana1151_pytorch212 similarity index 67% rename from habana/Dockerfile.habana1130_pytorch210 rename to habana/Dockerfile.habana1151_pytorch212 index e248bf505..cbec68928 100644 --- a/habana/Dockerfile.habana1130_pytorch210 +++ b/habana/Dockerfile.habana1151_pytorch212 @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.0:latest +FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest LABEL maintainer="Javier Duarte " RUN apt-get update && \ From 956b2b074b8329a6a6d4e5e091160c559141fdd1 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 10 Mar 2025 17:08:13 -0700 Subject: [PATCH 06/15] Update requirements_base.txt --- habana/requirements_base.txt | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/habana/requirements_base.txt b/habana/requirements_base.txt index febab9e79..4093cfa97 100644 --- a/habana/requirements_base.txt +++ b/habana/requirements_base.txt @@ -5,33 +5,27 @@ boost_histogram click comet-ml fastjet -fsspec jupyter jupyter-book matplotlib mlcroissant mplhep -networkx -nevergrad notebook numba -numpy onnx onnxruntime -pandas papermill plotly pre-commit protobuf pyarrow -ray[train,tune] +ray[tune] scikit-learn scikit-optimize scipy 
seaborn setGPU tensorflow-datasets -tqdm uproot vector zenodo_get From 34e84d4409084e54dda10696825a0880a0716336 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 09:17:39 -0700 Subject: [PATCH 07/15] update --- habana/Dockerfile.habana | 10 +-- habana/Dockerfile.habana1151_pytorch212 | 8 -- habana/gaudi-pod-python-v19-1hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-2hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-3hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-4hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-5hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-6hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-7hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-8hpu.yaml | 101 ------------------------ habana/requirements.txt | 28 ++----- habana/requirements_base.txt | 31 -------- habana/requirements_nodeps.txt | 6 -- 13 files changed, 11 insertions(+), 880 deletions(-) delete mode 100644 habana/Dockerfile.habana1151_pytorch212 delete mode 100644 habana/gaudi-pod-python-v19-1hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-2hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-3hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-4hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-5hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-6hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-7hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-8hpu.yaml delete mode 100644 habana/requirements_base.txt delete mode 100644 habana/requirements_nodeps.txt diff --git a/habana/Dockerfile.habana b/habana/Dockerfile.habana index a3e1aed0f..2a301f23a 100644 --- a/habana/Dockerfile.habana +++ b/habana/Dockerfile.habana @@ -1,8 +1,8 @@ -FROM vault.habana.ai/gaudi-docker/1.9.0/ubuntu20.04/habanalabs/tensorflow-installer-tf-cpu-2.11.0:latest - +FROM vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest LABEL maintainer="Javier Duarte " +RUN apt-get update && \ + apt-get install -qq -y graphviz graphviz-dev + COPY requirements.txt . -COPY requirements_nodeps.txt . -RUN pip install --no-cache-dir -r requirements.txt -RUN pip install --no-cache-dir -r requirements_nodeps.txt --no-deps +RUN pip install -r requirements.txt diff --git a/habana/Dockerfile.habana1151_pytorch212 b/habana/Dockerfile.habana1151_pytorch212 deleted file mode 100644 index cbec68928..000000000 --- a/habana/Dockerfile.habana1151_pytorch212 +++ /dev/null @@ -1,8 +0,0 @@ -FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest -LABEL maintainer="Javier Duarte " - -RUN apt-get update && \ - apt-get install -qq -y graphviz graphviz-dev - -COPY requirements_base.txt . 
-RUN pip install -r requirements_base.txt diff --git a/habana/gaudi-pod-python-v19-1hpu.yaml b/habana/gaudi-pod-python-v19-1hpu.yaml deleted file mode 100644 index aad5f06ce..000000000 --- a/habana/gaudi-pod-python-v19-1hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-1hpu-hvd-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 1 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-2hpu.yaml b/habana/gaudi-pod-python-v19-2hpu.yaml deleted file mode 100644 index 29342c0ca..000000000 --- a/habana/gaudi-pod-python-v19-2hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-2hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: 
jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 2 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-3hpu.yaml b/habana/gaudi-pod-python-v19-3hpu.yaml deleted file mode 100644 index fc16270df..000000000 --- a/habana/gaudi-pod-python-v19-3hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-3hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: 
/home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 3 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-4hpu.yaml b/habana/gaudi-pod-python-v19-4hpu.yaml deleted file mode 100644 index 6bccd1d46..000000000 --- a/habana/gaudi-pod-python-v19-4hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-4hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 4 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 
--benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-5hpu.yaml b/habana/gaudi-pod-python-v19-5hpu.yaml deleted file mode 100644 index cb40d37ad..000000000 --- a/habana/gaudi-pod-python-v19-5hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-5hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 5 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-6hpu.yaml b/habana/gaudi-pod-python-v19-6hpu.yaml deleted file mode 100644 index baf879982..000000000 --- a/habana/gaudi-pod-python-v19-6hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-6hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - 
imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 6 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-7hpu.yaml b/habana/gaudi-pod-python-v19-7hpu.yaml deleted file mode 100644 index e716f8069..000000000 --- a/habana/gaudi-pod-python-v19-7hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-7hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: 
["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 7 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-8hpu.yaml b/habana/gaudi-pod-python-v19-8hpu.yaml deleted file mode 100644 index 762229a96..000000000 --- a/habana/gaudi-pod-python-v19-8hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-8hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 8 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git 
a/habana/requirements.txt b/habana/requirements.txt index 2fc59e900..2410e55ad 100644 --- a/habana/requirements.txt +++ b/habana/requirements.txt @@ -1,47 +1,31 @@ array-record autopep8 awkward -bayesian-optimization boost_histogram click comet-ml -dill fastjet -fsspec -future -gviz-api +jupyter +jupyter-book matplotlib -mpi4py +mlcroissant mplhep -networkx -nevergrad notebook numba onnx onnxruntime -pandas papermill plotly pre-commit -promise protobuf pyarrow -ray[default]==1.6.0 -ray[tune]==1.6.0 -scikit-learn +ray[tune] +scikit-learn==1.5.2 scikit-optimize scipy seaborn setGPU -tensorboard_plugin_profile -tensorflow-addons -tensorflow-datasets==4.9.1 -tensorflow-estimator -tensorflow-hub -tensorflow-metadata -tensorflow-probability -tqdm -typeguard +tensorflow-datasets uproot vector zenodo_get diff --git a/habana/requirements_base.txt b/habana/requirements_base.txt deleted file mode 100644 index 4093cfa97..000000000 --- a/habana/requirements_base.txt +++ /dev/null @@ -1,31 +0,0 @@ -array-record -autopep8 -awkward -boost_histogram -click -comet-ml -fastjet -jupyter -jupyter-book -matplotlib -mlcroissant -mplhep -notebook -numba -onnx -onnxruntime -papermill -plotly -pre-commit -protobuf -pyarrow -ray[tune] -scikit-learn -scikit-optimize -scipy -seaborn -setGPU -tensorflow-datasets -uproot -vector -zenodo_get diff --git a/habana/requirements_nodeps.txt b/habana/requirements_nodeps.txt deleted file mode 100644 index 4f6db4fbd..000000000 --- a/habana/requirements_nodeps.txt +++ /dev/null @@ -1,6 +0,0 @@ -git+https://github.com/jpata/hep_tfds.git@31baf14defc53dcd1d7555e4a3945083e45e9304 -keras-tuner -kt-legacy -tensorflow-text -tf-models-official -tf2onnx From b81874c0e9d8543ecc360699dce6e25f8a710519 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 09:42:26 -0700 Subject: [PATCH 08/15] numba 0.60.0 --- habana/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/habana/requirements.txt b/habana/requirements.txt index 2410e55ad..025a3da4a 100644 --- a/habana/requirements.txt +++ b/habana/requirements.txt @@ -11,7 +11,7 @@ matplotlib mlcroissant mplhep notebook -numba +numba==0.60.0 onnx onnxruntime papermill @@ -20,7 +20,7 @@ pre-commit protobuf pyarrow ray[tune] -scikit-learn==1.5.2 +scikit-learn scikit-optimize scipy seaborn From a1d34b60433af4af4ac80f2dcb1f1a52b4296196 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 14:56:27 -0700 Subject: [PATCH 09/15] add tf --- habana/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/habana/requirements.txt b/habana/requirements.txt index 025a3da4a..27db3e805 100644 --- a/habana/requirements.txt +++ b/habana/requirements.txt @@ -25,6 +25,7 @@ scikit-optimize scipy seaborn setGPU +tensorflow-cpu==2.11.1 tensorflow-datasets uproot vector From 72747be71f6e24512d08cff28dfde0f5783a818a Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 15:26:52 -0700 Subject: [PATCH 10/15] try habana --- mlpf/model/inference.py | 3 ++- mlpf/model/training.py | 8 ++++++-- mlpf/pipeline.py | 5 ++++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/mlpf/model/inference.py b/mlpf/model/inference.py index 8abdadd75..72c0a7a9d 100644 --- a/mlpf/model/inference.py +++ b/mlpf/model/inference.py @@ -34,7 +34,7 @@ def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_match_dr, outpath, dir_name, sample): - + import habana_frameworks.torch.core as htcore # skip prediction if output exists outfile = 
f"{outpath}/preds{dir_name}/{sample}/pred_{rank}_{i}.parquet" if os.path.isfile(outfile): @@ -43,6 +43,7 @@ def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_m # run model on batch batch = batch.to(rank) ypred = model(batch.X, batch.mask) + htcore.mark_step() # convert all outputs to float32 in case running in float16 or bfloat16 ypred = tuple([y.to(torch.float32) for y in ypred]) diff --git a/mlpf/model/training.py b/mlpf/model/training.py index 78e115b55..38343655c 100644 --- a/mlpf/model/training.py +++ b/mlpf/model/training.py @@ -76,13 +76,16 @@ def model_step(batch, model, loss_fn): def optimizer_step(model, loss_opt, optimizer, lr_schedule, scaler): + import habana_frameworks.torch.core as htcore # Clear gradients for param in model.parameters(): param.grad = None # Backward pass and optimization scaler.scale(loss_opt).backward() + htcore.mark_step() scaler.step(optimizer) + htcore.mark_step() scaler.update() if lr_schedule: lr_schedule.step() @@ -644,7 +647,8 @@ def get_relevant_directory(path): if config["conv_type"] == "attention": model_kwargs["attention_type"] = config["model"]["attention"]["attention_type"] - model = MLPF(**model_kwargs).to(torch.device(rank)) + # model = MLPF(**model_kwargs).to(torch.device(rank)) + model = MLPF(**model_kwargs).to(torch.device("hpu")) optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"]) checkpoint = torch.load(config["load"], map_location=torch.device(rank)) @@ -829,7 +833,7 @@ def override_config(config: dict, args): # Run either on CPU, single GPU or multi-GPU using pytorch -def device_agnostic_run(config, world_size, outdir): +def device_agnostic_run(config, world_size, outdir, habana=False): if config["train"]: logfile = f"{outdir}/train.log" else: diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 057ccf285..f3da36191 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -91,6 +91,7 @@ default=None, help="will load and run a training and log the result in the --prefix directory", ) +parser.add_argument("--habana", action="store_true", default=None, help="use Habana Gaudi processor") def get_outdir(resume_training, load): @@ -113,6 +114,8 @@ def get_outdir(resume_training, load): def main(): # https://github.com/pytorch/pytorch/issues/11201#issuecomment-895047235 import torch + if args.habana: + import habana_frameworks.torch.core as htcore torch.multiprocessing.set_sharing_strategy(SHARING_STRATEGY) @@ -176,7 +179,7 @@ def main(): if args.ray_train: run_ray_training(config, args, outdir) else: - device_agnostic_run(config, world_size, outdir) + device_agnostic_run(config, world_size, outdir, args.habana) if __name__ == "__main__": From f96eeccd2c07a8abfbd78c7d446e40ed39f6b666 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 15:29:06 -0700 Subject: [PATCH 11/15] update --- mlpf/pipeline.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index f3da36191..974de3c3a 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -114,13 +114,12 @@ def get_outdir(resume_training, load): def main(): # https://github.com/pytorch/pytorch/issues/11201#issuecomment-895047235 import torch - if args.habana: - import habana_frameworks.torch.core as htcore - torch.multiprocessing.set_sharing_strategy(SHARING_STRATEGY) # plt.rcParams['text.usetex'] = True args = parser.parse_args() + if args.habana: + import habana_frameworks.torch.core as htcore if args.resume_training and not args.ray_train: raise NotImplementedError( From 
c57048c97bb510ce7cfdc6f12e120557d7cd47bd Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 15:37:37 -0700 Subject: [PATCH 12/15] generalize device --- mlpf/model/training.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mlpf/model/training.py b/mlpf/model/training.py index 38343655c..cc11b6979 100644 --- a/mlpf/model/training.py +++ b/mlpf/model/training.py @@ -841,15 +841,20 @@ def device_agnostic_run(config, world_size, outdir, habana=False): _configLogger("mlpf", filename=logfile) if config["gpus"]: + if habana: + import habana_frameworks.torch.hpu as torch_device + else: + import torch.cuda as torch_device assert ( - world_size <= torch.cuda.device_count() - ), f"--gpus is too high (specified {world_size} gpus but only {torch.cuda.device_count()} gpus are available)" + world_size <= torch_device.device_count() + ), f"--gpus is too high (specified {world_size} gpus but only {torch_device.device_count()} gpus are available)" - torch.cuda.empty_cache() + if not habana: + torch.cuda.empty_cache() if world_size > 1: _logger.info(f"Will use torch.nn.parallel.DistributedDataParallel() and {world_size} gpus", color="purple") for rank in range(world_size): - _logger.info(torch.cuda.get_device_name(rank), color="purple") + _logger.info(torch_device.get_device_name(rank), color="purple") mp.spawn( run, @@ -859,7 +864,7 @@ def device_agnostic_run(config, world_size, outdir, habana=False): ) elif world_size == 1: rank = 0 - _logger.info(f"Will use single-gpu: {torch.cuda.get_device_name(rank)}", color="purple") + _logger.info(f"Will use single-gpu: {torch_device.get_device_name(rank)}", color="purple") run(rank, world_size, config, outdir, logfile) else: From 5cb738758ac6444d8c3f0527a50b3534f63ec944 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 20:07:16 -0700 Subject: [PATCH 13/15] test --- mlpf/model/training.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mlpf/model/training.py b/mlpf/model/training.py index f8a9f17b6..b80f243e0 100644 --- a/mlpf/model/training.py +++ b/mlpf/model/training.py @@ -67,7 +67,9 @@ def configure_model_trainable(model: MLPF, trainable: Union[str, List[str]], is_ def model_step(batch, model, loss_fn): + import habana_frameworks.torch.core as htcore ypred_raw = model(batch.X, batch.mask) + htcore.mark_step() ypred = unpack_predictions(ypred_raw) ytarget = unpack_target(batch.ytarget, model) loss_opt, losses_detached = loss_fn(ytarget, ypred, batch) @@ -136,7 +138,7 @@ def train_epoch( iterator = tqdm.tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch} train loop on rank={rank}") for itrain, batch in iterator: - batch = batch.to(rank, non_blocking=True) + batch = batch.to("hpu", non_blocking=True) with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): loss_opt, loss, _, _, _ = model_step(batch, model, mlpf_loss) @@ -344,14 +346,14 @@ def train_all_epochs( matplotlib.use("agg") # Setup tensorboard writers - if (rank == 0) or (rank == "cpu"): + if (rank == 0) or (rank == "cpu") or (rank == "hpu"): tensorboard_writer_train = SummaryWriter(f"{outdir}/runs/train") tensorboard_writer_valid = SummaryWriter(f"{outdir}/runs/valid") else: tensorboard_writer_train = None tensorboard_writer_valid = None - device_type = "cuda" if isinstance(rank, int) else "cpu" + device_type = "hpu" t0_initial = time.time() # Early stopping setup @@ -581,7 +583,7 @@ def run_test(rank, world_size, config, outdir, model, sample, 
testdir_name, dtyp else: raise Exception("not implemented") - device_type = "cuda" if isinstance(rank, int) else "cpu" + device_type = "hpu" with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): run_predictions( world_size, From b57319dbcad14b50aa812ada245137ee372fd60a Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 20:28:24 -0700 Subject: [PATCH 14/15] num_classes=2 --- mlpf/model/losses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/model/losses.py b/mlpf/model/losses.py index 59a3f2d8a..0efb34957 100644 --- a/mlpf/model/losses.py +++ b/mlpf/model/losses.py @@ -112,7 +112,7 @@ def mlpf_loss(y, ypred, batch): was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( axis=-1 ) - was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( + was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0, num_classes=2).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( axis=-1 ) From 770cacac3890126ffe3216fb2df9afd43f8d5181 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 20:29:36 -0700 Subject: [PATCH 15/15] fix --- mlpf/model/losses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/model/losses.py b/mlpf/model/losses.py index 0efb34957..e57e00831 100644 --- a/mlpf/model/losses.py +++ b/mlpf/model/losses.py @@ -112,7 +112,7 @@ def mlpf_loss(y, ypred, batch): was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( axis=-1 ) - was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0, num_classes=2).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( + was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long), num_classes=2), y["momentum"]], axis=-1) * batch.mask.unsqueeze( axis=-1 )
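Taken together, patches 10/15 through 15/15 above wire Habana Gaudi (HPU) support into the PyTorch training loop: the model and batches are moved to the "hpu" device, habana_frameworks.torch.core.mark_step() is called after the forward pass, the backward pass, and the optimizer step, and device queries go through habana_frameworks.torch.hpu instead of torch.cuda. Below is a minimal, self-contained sketch of that pattern, assuming the habana_frameworks package shipped in the Gaudi PyTorch base image; the toy linear model, MSE loss, and random data are placeholders, not the repository's MLPF classes.

    # Minimal sketch of the HPU training-step pattern applied in patches 10-15.
    # Assumes habana_frameworks from the Gaudi PyTorch image; the linear model,
    # MSE loss, and random data are placeholders, not the repository's MLPF code.
    import torch
    import habana_frameworks.torch.core as htcore   # registers the "hpu" backend
    import habana_frameworks.torch.hpu as hpu

    device = torch.device("hpu")
    model = torch.nn.Linear(16, 4).to(device)        # stand-in for MLPF(**model_kwargs).to("hpu")
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.MSELoss()

    def train_step(x, y):
        optimizer.zero_grad(set_to_none=True)        # the patch sets param.grad = None; this is equivalent
        pred = model(x.to(device))                   # forward pass on the HPU
        htcore.mark_step()                           # flush the lazy-mode graph after the forward pass
        loss = loss_fn(pred, y.to(device))
        loss.backward()
        htcore.mark_step()                           # flush after backward
        optimizer.step()
        htcore.mark_step()                           # flush after the optimizer update
        return loss.item()

    if __name__ == "__main__":
        print(f"HPUs available: {hpu.device_count()} ({hpu.get_device_name(0)})")
        x, y = torch.randn(8, 16), torch.randn(8, 4)
        print("loss:", train_step(x, y))

As in patch 12/15, device discovery and capacity checks go through habana_frameworks.torch.hpu (device_count(), get_device_name()) in place of torch.cuda when running on Gaudi.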