From f1d9e21b05ecbccc426863291ff42c17e69fba2e Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 8 Apr 2024 13:36:24 -0700 Subject: [PATCH 01/15] dockerfile --- habana/Dockerfile.habana1130_pytorch210 | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 habana/Dockerfile.habana1130_pytorch210 diff --git a/habana/Dockerfile.habana1130_pytorch210 b/habana/Dockerfile.habana1130_pytorch210 new file mode 100644 index 000000000..5c0319d0e --- /dev/null +++ b/habana/Dockerfile.habana1130_pytorch210 @@ -0,0 +1,24 @@ +FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.0:latest +LABEL maintainer="Javier Duarte " + +RUN pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.1.0+cpu.html +RUN pip install transformers datasets +RUN pip install onnx onnxruntime onnxscript +RUN pip install torch_runstats +RUN pip install scikit-image +RUN pip install absl-py +RUN pip install sporco +RUN pip install Ninja +RUN pip install ml-collections +RUN pip install keras-core +RUN pip pip install keras-cv +RUN pip pip install tensorflow-datasets +RUN pip pip install packaging ninja +# RUN MAX_JOBS=4 pip install flash-attn --no-build-isolation +RUN pip install triton +RUN pip install hls4ml[profiling] +RUN pip install open3d-cpu +RUN pip install ray[default] ray[train] ray[tune] +RUN pip install causal-conv1d +RUN pip install mamba-ssm +RUN pip install comet_ml From 553304ed248bc8d475f4ed235d029210813609f6 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 8 Apr 2024 14:02:31 -0700 Subject: [PATCH 02/15] fix --- habana/Dockerfile.habana1130_pytorch210 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/habana/Dockerfile.habana1130_pytorch210 b/habana/Dockerfile.habana1130_pytorch210 index 5c0319d0e..45ea15984 100644 --- a/habana/Dockerfile.habana1130_pytorch210 +++ b/habana/Dockerfile.habana1130_pytorch210 @@ -11,9 +11,9 @@ RUN pip install sporco RUN pip install Ninja RUN pip install ml-collections RUN pip install keras-core -RUN pip pip install keras-cv -RUN pip pip install tensorflow-datasets -RUN pip pip install packaging ninja +RUN pip install keras-cv +RUN pip install tensorflow-datasets +RUN pip install packaging ninja # RUN MAX_JOBS=4 pip install flash-attn --no-build-isolation RUN pip install triton RUN pip install hls4ml[profiling] From c668010494b416b6d8716aad184e1849d44b5d69 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 8 Apr 2024 18:11:44 -0700 Subject: [PATCH 03/15] update --- habana/Dockerfile.habana1130_pytorch210 | 11 +++-- habana/requirements_base.txt | 62 +++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 habana/requirements_base.txt diff --git a/habana/Dockerfile.habana1130_pytorch210 b/habana/Dockerfile.habana1130_pytorch210 index 45ea15984..63263c4c2 100644 --- a/habana/Dockerfile.habana1130_pytorch210 +++ b/habana/Dockerfile.habana1130_pytorch210 @@ -1,7 +1,12 @@ FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.0:latest LABEL maintainer="Javier Duarte " -RUN pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.1.0+cpu.html +RUN apt-get update && \ + apt-get install -qq -y graphviz graphviz-dev + +COPY requirements_base.txt . 
+RUN pip install -r requirements_base.txt +RUN pip install --verbose torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.1.0+cpu.html RUN pip install transformers datasets RUN pip install onnx onnxruntime onnxscript RUN pip install torch_runstats @@ -19,6 +24,6 @@ RUN pip install triton RUN pip install hls4ml[profiling] RUN pip install open3d-cpu RUN pip install ray[default] ray[train] ray[tune] -RUN pip install causal-conv1d -RUN pip install mamba-ssm +# RUN pip install causal-conv1d +# RUN pip install mamba-ssm RUN pip install comet_ml diff --git a/habana/requirements_base.txt b/habana/requirements_base.txt new file mode 100644 index 000000000..d9f967ed0 --- /dev/null +++ b/habana/requirements_base.txt @@ -0,0 +1,62 @@ +POT +PyYAML +astropy +awkward +awkward0 +black +bokeh +boost-histogram +corner +dask +distributed +docopt +emcee +energyflow +fastjet +fastparquet +flake8 +girder-client +hdbscan +healpy +hydra-core +imageio +imageio-ffmpeg +ipyparallel +isort +jupyter +jupyterlab +kaleido +line_profiler +lmfit +lz4 +matplotlib +memory_profiler +mpl_scatter_density +mplhep +networkx +notebook +numba +numpy +pandas +papermill +parsl +particle +plotly +pre-commit +pyarrow +pydot +pygraphviz +pyhf +pymultinest +pynbody +pytest +scikit-learn +scipy +seaborn +tables +tensorboard +tqdm +uproot +vector +xxhash +zenodo_get From 4dbfbbfb46b97df2671369690436c9a505282fd5 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 11 Nov 2024 21:18:25 -0800 Subject: [PATCH 04/15] update docker --- habana/Dockerfile.habana1130_pytorch210 | 23 +--------- habana/requirements_base.txt | 57 +++++++------------------ 2 files changed, 17 insertions(+), 63 deletions(-) diff --git a/habana/Dockerfile.habana1130_pytorch210 b/habana/Dockerfile.habana1130_pytorch210 index 63263c4c2..e248bf505 100644 --- a/habana/Dockerfile.habana1130_pytorch210 +++ b/habana/Dockerfile.habana1130_pytorch210 @@ -3,27 +3,6 @@ LABEL maintainer="Javier Duarte " RUN apt-get update && \ apt-get install -qq -y graphviz graphviz-dev - + COPY requirements_base.txt . 
RUN pip install -r requirements_base.txt -RUN pip install --verbose torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.1.0+cpu.html -RUN pip install transformers datasets -RUN pip install onnx onnxruntime onnxscript -RUN pip install torch_runstats -RUN pip install scikit-image -RUN pip install absl-py -RUN pip install sporco -RUN pip install Ninja -RUN pip install ml-collections -RUN pip install keras-core -RUN pip install keras-cv -RUN pip install tensorflow-datasets -RUN pip install packaging ninja -# RUN MAX_JOBS=4 pip install flash-attn --no-build-isolation -RUN pip install triton -RUN pip install hls4ml[profiling] -RUN pip install open3d-cpu -RUN pip install ray[default] ray[train] ray[tune] -# RUN pip install causal-conv1d -# RUN pip install mamba-ssm -RUN pip install comet_ml diff --git a/habana/requirements_base.txt b/habana/requirements_base.txt index d9f967ed0..febab9e79 100644 --- a/habana/requirements_base.txt +++ b/habana/requirements_base.txt @@ -1,62 +1,37 @@ -POT -PyYAML -astropy +array-record +autopep8 awkward -awkward0 -black -bokeh -boost-histogram -corner -dask -distributed -docopt -emcee -energyflow +boost_histogram +click +comet-ml fastjet -fastparquet -flake8 -girder-client -hdbscan -healpy -hydra-core -imageio -imageio-ffmpeg -ipyparallel -isort +fsspec jupyter -jupyterlab -kaleido -line_profiler -lmfit -lz4 +jupyter-book matplotlib -memory_profiler -mpl_scatter_density +mlcroissant mplhep networkx +nevergrad notebook numba numpy +onnx +onnxruntime pandas papermill -parsl -particle plotly pre-commit +protobuf pyarrow -pydot -pygraphviz -pyhf -pymultinest -pynbody -pytest +ray[train,tune] scikit-learn +scikit-optimize scipy seaborn -tables -tensorboard +setGPU +tensorflow-datasets tqdm uproot vector -xxhash zenodo_get From ffd0674de81c209291b2a8c406ad51b4435c5297 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 16 Dec 2024 18:30:25 -0800 Subject: [PATCH 05/15] update --- ...e.habana1130_pytorch210 => Dockerfile.habana1151_pytorch212} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename habana/{Dockerfile.habana1130_pytorch210 => Dockerfile.habana1151_pytorch212} (67%) diff --git a/habana/Dockerfile.habana1130_pytorch210 b/habana/Dockerfile.habana1151_pytorch212 similarity index 67% rename from habana/Dockerfile.habana1130_pytorch210 rename to habana/Dockerfile.habana1151_pytorch212 index e248bf505..cbec68928 100644 --- a/habana/Dockerfile.habana1130_pytorch210 +++ b/habana/Dockerfile.habana1151_pytorch212 @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.0:latest +FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest LABEL maintainer="Javier Duarte " RUN apt-get update && \ From 956b2b074b8329a6a6d4e5e091160c559141fdd1 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 10 Mar 2025 17:08:13 -0700 Subject: [PATCH 06/15] Update requirements_base.txt --- habana/requirements_base.txt | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/habana/requirements_base.txt b/habana/requirements_base.txt index febab9e79..4093cfa97 100644 --- a/habana/requirements_base.txt +++ b/habana/requirements_base.txt @@ -5,33 +5,27 @@ boost_histogram click comet-ml fastjet -fsspec jupyter jupyter-book matplotlib mlcroissant mplhep -networkx -nevergrad notebook numba -numpy onnx onnxruntime -pandas papermill plotly pre-commit protobuf pyarrow -ray[train,tune] +ray[tune] scikit-learn scikit-optimize scipy 
seaborn setGPU tensorflow-datasets -tqdm uproot vector zenodo_get From 34e84d4409084e54dda10696825a0880a0716336 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 09:17:39 -0700 Subject: [PATCH 07/15] update --- habana/Dockerfile.habana | 10 +-- habana/Dockerfile.habana1151_pytorch212 | 8 -- habana/gaudi-pod-python-v19-1hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-2hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-3hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-4hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-5hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-6hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-7hpu.yaml | 101 ------------------------ habana/gaudi-pod-python-v19-8hpu.yaml | 101 ------------------------ habana/requirements.txt | 28 ++----- habana/requirements_base.txt | 31 -------- habana/requirements_nodeps.txt | 6 -- 13 files changed, 11 insertions(+), 880 deletions(-) delete mode 100644 habana/Dockerfile.habana1151_pytorch212 delete mode 100644 habana/gaudi-pod-python-v19-1hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-2hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-3hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-4hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-5hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-6hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-7hpu.yaml delete mode 100644 habana/gaudi-pod-python-v19-8hpu.yaml delete mode 100644 habana/requirements_base.txt delete mode 100644 habana/requirements_nodeps.txt diff --git a/habana/Dockerfile.habana b/habana/Dockerfile.habana index a3e1aed0f..2a301f23a 100644 --- a/habana/Dockerfile.habana +++ b/habana/Dockerfile.habana @@ -1,8 +1,8 @@ -FROM vault.habana.ai/gaudi-docker/1.9.0/ubuntu20.04/habanalabs/tensorflow-installer-tf-cpu-2.11.0:latest - +FROM vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest LABEL maintainer="Javier Duarte " +RUN apt-get update && \ + apt-get install -qq -y graphviz graphviz-dev + COPY requirements.txt . -COPY requirements_nodeps.txt . -RUN pip install --no-cache-dir -r requirements.txt -RUN pip install --no-cache-dir -r requirements_nodeps.txt --no-deps +RUN pip install -r requirements.txt diff --git a/habana/Dockerfile.habana1151_pytorch212 b/habana/Dockerfile.habana1151_pytorch212 deleted file mode 100644 index cbec68928..000000000 --- a/habana/Dockerfile.habana1151_pytorch212 +++ /dev/null @@ -1,8 +0,0 @@ -FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest -LABEL maintainer="Javier Duarte " - -RUN apt-get update && \ - apt-get install -qq -y graphviz graphviz-dev - -COPY requirements_base.txt . 
-RUN pip install -r requirements_base.txt diff --git a/habana/gaudi-pod-python-v19-1hpu.yaml b/habana/gaudi-pod-python-v19-1hpu.yaml deleted file mode 100644 index aad5f06ce..000000000 --- a/habana/gaudi-pod-python-v19-1hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-1hpu-hvd-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 1 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-2hpu.yaml b/habana/gaudi-pod-python-v19-2hpu.yaml deleted file mode 100644 index 29342c0ca..000000000 --- a/habana/gaudi-pod-python-v19-2hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-2hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: 
jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 2 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-3hpu.yaml b/habana/gaudi-pod-python-v19-3hpu.yaml deleted file mode 100644 index fc16270df..000000000 --- a/habana/gaudi-pod-python-v19-3hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-3hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: 
/home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 3 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-4hpu.yaml b/habana/gaudi-pod-python-v19-4hpu.yaml deleted file mode 100644 index 6bccd1d46..000000000 --- a/habana/gaudi-pod-python-v19-4hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-4hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 4 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 
--benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-5hpu.yaml b/habana/gaudi-pod-python-v19-5hpu.yaml deleted file mode 100644 index cb40d37ad..000000000 --- a/habana/gaudi-pod-python-v19-5hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-5hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 5 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-6hpu.yaml b/habana/gaudi-pod-python-v19-6hpu.yaml deleted file mode 100644 index baf879982..000000000 --- a/habana/gaudi-pod-python-v19-6hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-6hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - 
imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 6 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-7hpu.yaml b/habana/gaudi-pod-python-v19-7hpu.yaml deleted file mode 100644 index e716f8069..000000000 --- a/habana/gaudi-pod-python-v19-7hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-7hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: 
["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 7 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git a/habana/gaudi-pod-python-v19-8hpu.yaml b/habana/gaudi-pod-python-v19-8hpu.yaml deleted file mode 100644 index 762229a96..000000000 --- a/habana/gaudi-pod-python-v19-8hpu.yaml +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: mlpf-hpu-strategy-v19-8hpu-constbatch-bm2 -spec: - completions: 1 - parallelism: 1 - backoffLimit: 0 - template: - spec: - restartPolicy: Never - serviceAccountName: jduarte - nodeSelector: - brightcomputing.com/node-category: "gaudi" - hostNetwork: false - volumes: - - name: home - hostPath: - path: /home/jduarte - type: Directory - - name: ceph - hostPath: - path: /voyager/ceph/users/jduarte - type: Directory - - name: scratch - emptyDir: {} - imagePullSecrets: - - name: registry-credentials - containers: - - name: htf2110-190-580-20230327-ubuntu2004 - image: jmduarte/particleflow:habana_v19 - imagePullPolicy: Always - resources: - requests: - cpu: 48 - memory: 384Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 256Gi - limits: - cpu: 96 - memory: 396Gi - habana.ai/gaudi: 8 - hugepages-2Mi: 96000Mi - ephemeral-storage: 512Gi - volumeMounts: - - name: home - mountPath: /home/jduarte - - name: ceph - mountPath: /voyager/ceph/users/jduarte - - name: scratch - mountPath: /scratch - env: - - name: POD_NAME_ID - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NODE_HOSTNAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: HOME - value: "/home/jduarte" - - name: CEPH - value: "/voyager/ceph/users/jduarte" - - name: LOCAL_SCRATCH_DIR - value: "/scratch" - - name: MPI_ROOT - value: "/opt/amazon/openmpi" - - name: TFDS_DATA_DIR - value: "/voyager/ceph/users/jduarte/tensorflow_datasets" - workingDir: /home/jduarte/particleflow - command: ["/bin/bash", "-c"] - args: - - >- - declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')"; - declare -xir UNIX_TIME="$(date +'%s')"; - - declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}"; - - declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml"; - declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})"; - - echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}"; - echo ""; - - cat "${K8S_JOB_YAML_FILE}"; - - printenv; - - cat /etc/os-release; - lscpu; - free -h; - cat /proc/meminfo; - lsblk --output-all; - cat /etc/fstab; - lspci -vvv; - hl-smi; - hl-smi -q; - - time -p mpirun -n 8 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir; diff --git 
a/habana/requirements.txt b/habana/requirements.txt index 2fc59e900..2410e55ad 100644 --- a/habana/requirements.txt +++ b/habana/requirements.txt @@ -1,47 +1,31 @@ array-record autopep8 awkward -bayesian-optimization boost_histogram click comet-ml -dill fastjet -fsspec -future -gviz-api +jupyter +jupyter-book matplotlib -mpi4py +mlcroissant mplhep -networkx -nevergrad notebook numba onnx onnxruntime -pandas papermill plotly pre-commit -promise protobuf pyarrow -ray[default]==1.6.0 -ray[tune]==1.6.0 -scikit-learn +ray[tune] +scikit-learn==1.5.2 scikit-optimize scipy seaborn setGPU -tensorboard_plugin_profile -tensorflow-addons -tensorflow-datasets==4.9.1 -tensorflow-estimator -tensorflow-hub -tensorflow-metadata -tensorflow-probability -tqdm -typeguard +tensorflow-datasets uproot vector zenodo_get diff --git a/habana/requirements_base.txt b/habana/requirements_base.txt deleted file mode 100644 index 4093cfa97..000000000 --- a/habana/requirements_base.txt +++ /dev/null @@ -1,31 +0,0 @@ -array-record -autopep8 -awkward -boost_histogram -click -comet-ml -fastjet -jupyter -jupyter-book -matplotlib -mlcroissant -mplhep -notebook -numba -onnx -onnxruntime -papermill -plotly -pre-commit -protobuf -pyarrow -ray[tune] -scikit-learn -scikit-optimize -scipy -seaborn -setGPU -tensorflow-datasets -uproot -vector -zenodo_get diff --git a/habana/requirements_nodeps.txt b/habana/requirements_nodeps.txt deleted file mode 100644 index 4f6db4fbd..000000000 --- a/habana/requirements_nodeps.txt +++ /dev/null @@ -1,6 +0,0 @@ -git+https://github.com/jpata/hep_tfds.git@31baf14defc53dcd1d7555e4a3945083e45e9304 -keras-tuner -kt-legacy -tensorflow-text -tf-models-official -tf2onnx From b81874c0e9d8543ecc360699dce6e25f8a710519 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 09:42:26 -0700 Subject: [PATCH 08/15] numba 0.60.0 --- habana/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/habana/requirements.txt b/habana/requirements.txt index 2410e55ad..025a3da4a 100644 --- a/habana/requirements.txt +++ b/habana/requirements.txt @@ -11,7 +11,7 @@ matplotlib mlcroissant mplhep notebook -numba +numba==0.60.0 onnx onnxruntime papermill @@ -20,7 +20,7 @@ pre-commit protobuf pyarrow ray[tune] -scikit-learn==1.5.2 +scikit-learn scikit-optimize scipy seaborn From a1d34b60433af4af4ac80f2dcb1f1a52b4296196 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 14:56:27 -0700 Subject: [PATCH 09/15] add tf --- habana/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/habana/requirements.txt b/habana/requirements.txt index 025a3da4a..27db3e805 100644 --- a/habana/requirements.txt +++ b/habana/requirements.txt @@ -25,6 +25,7 @@ scikit-optimize scipy seaborn setGPU +tensorflow-cpu==2.11.1 tensorflow-datasets uproot vector From 72747be71f6e24512d08cff28dfde0f5783a818a Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 15:26:52 -0700 Subject: [PATCH 10/15] try habana --- mlpf/model/inference.py | 3 ++- mlpf/model/training.py | 8 ++++++-- mlpf/pipeline.py | 5 ++++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/mlpf/model/inference.py b/mlpf/model/inference.py index 8abdadd75..72c0a7a9d 100644 --- a/mlpf/model/inference.py +++ b/mlpf/model/inference.py @@ -34,7 +34,7 @@ def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_match_dr, outpath, dir_name, sample): - + import habana_frameworks.torch.core as htcore # skip prediction if output exists outfile = 
f"{outpath}/preds{dir_name}/{sample}/pred_{rank}_{i}.parquet" if os.path.isfile(outfile): @@ -43,6 +43,7 @@ def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_m # run model on batch batch = batch.to(rank) ypred = model(batch.X, batch.mask) + htcore.mark_step() # convert all outputs to float32 in case running in float16 or bfloat16 ypred = tuple([y.to(torch.float32) for y in ypred]) diff --git a/mlpf/model/training.py b/mlpf/model/training.py index 78e115b55..38343655c 100644 --- a/mlpf/model/training.py +++ b/mlpf/model/training.py @@ -76,13 +76,16 @@ def model_step(batch, model, loss_fn): def optimizer_step(model, loss_opt, optimizer, lr_schedule, scaler): + import habana_frameworks.torch.core as htcore # Clear gradients for param in model.parameters(): param.grad = None # Backward pass and optimization scaler.scale(loss_opt).backward() + htcore.mark_step() scaler.step(optimizer) + htcore.mark_step() scaler.update() if lr_schedule: lr_schedule.step() @@ -644,7 +647,8 @@ def get_relevant_directory(path): if config["conv_type"] == "attention": model_kwargs["attention_type"] = config["model"]["attention"]["attention_type"] - model = MLPF(**model_kwargs).to(torch.device(rank)) + # model = MLPF(**model_kwargs).to(torch.device(rank)) + model = MLPF(**model_kwargs).to(torch.device("hpu")) optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"]) checkpoint = torch.load(config["load"], map_location=torch.device(rank)) @@ -829,7 +833,7 @@ def override_config(config: dict, args): # Run either on CPU, single GPU or multi-GPU using pytorch -def device_agnostic_run(config, world_size, outdir): +def device_agnostic_run(config, world_size, outdir, habana=False): if config["train"]: logfile = f"{outdir}/train.log" else: diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 057ccf285..f3da36191 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -91,6 +91,7 @@ default=None, help="will load and run a training and log the result in the --prefix directory", ) +parser.add_argument("--habana", action="store_true", default=None, help="use Habana Gaudi processor") def get_outdir(resume_training, load): @@ -113,6 +114,8 @@ def get_outdir(resume_training, load): def main(): # https://github.com/pytorch/pytorch/issues/11201#issuecomment-895047235 import torch + if args.habana: + import habana_frameworks.torch.core as htcore torch.multiprocessing.set_sharing_strategy(SHARING_STRATEGY) @@ -176,7 +179,7 @@ def main(): if args.ray_train: run_ray_training(config, args, outdir) else: - device_agnostic_run(config, world_size, outdir) + device_agnostic_run(config, world_size, outdir, args.habana) if __name__ == "__main__": From f96eeccd2c07a8abfbd78c7d446e40ed39f6b666 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 15:29:06 -0700 Subject: [PATCH 11/15] update --- mlpf/pipeline.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index f3da36191..974de3c3a 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -114,13 +114,12 @@ def get_outdir(resume_training, load): def main(): # https://github.com/pytorch/pytorch/issues/11201#issuecomment-895047235 import torch - if args.habana: - import habana_frameworks.torch.core as htcore - torch.multiprocessing.set_sharing_strategy(SHARING_STRATEGY) # plt.rcParams['text.usetex'] = True args = parser.parse_args() + if args.habana: + import habana_frameworks.torch.core as htcore if args.resume_training and not args.ray_train: raise NotImplementedError( From 
c57048c97bb510ce7cfdc6f12e120557d7cd47bd Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 15:37:37 -0700 Subject: [PATCH 12/15] generalize device --- mlpf/model/training.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mlpf/model/training.py b/mlpf/model/training.py index 38343655c..cc11b6979 100644 --- a/mlpf/model/training.py +++ b/mlpf/model/training.py @@ -841,15 +841,20 @@ def device_agnostic_run(config, world_size, outdir, habana=False): _configLogger("mlpf", filename=logfile) if config["gpus"]: + if habana: + import habana_frameworks.torch.hpu as torch_device + else: + import torch.cuda as torch_device assert ( - world_size <= torch.cuda.device_count() - ), f"--gpus is too high (specified {world_size} gpus but only {torch.cuda.device_count()} gpus are available)" + world_size <= torch_device.device_count() + ), f"--gpus is too high (specified {world_size} gpus but only {torch_device.device_count()} gpus are available)" - torch.cuda.empty_cache() + if not habana: + torch.cuda.empty_cache() if world_size > 1: _logger.info(f"Will use torch.nn.parallel.DistributedDataParallel() and {world_size} gpus", color="purple") for rank in range(world_size): - _logger.info(torch.cuda.get_device_name(rank), color="purple") + _logger.info(torch_device.get_device_name(rank), color="purple") mp.spawn( run, @@ -859,7 +864,7 @@ def device_agnostic_run(config, world_size, outdir, habana=False): ) elif world_size == 1: rank = 0 - _logger.info(f"Will use single-gpu: {torch.cuda.get_device_name(rank)}", color="purple") + _logger.info(f"Will use single-gpu: {torch_device.get_device_name(rank)}", color="purple") run(rank, world_size, config, outdir, logfile) else: From 5cb738758ac6444d8c3f0527a50b3534f63ec944 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 20:07:16 -0700 Subject: [PATCH 13/15] test --- mlpf/model/training.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mlpf/model/training.py b/mlpf/model/training.py index f8a9f17b6..b80f243e0 100644 --- a/mlpf/model/training.py +++ b/mlpf/model/training.py @@ -67,7 +67,9 @@ def configure_model_trainable(model: MLPF, trainable: Union[str, List[str]], is_ def model_step(batch, model, loss_fn): + import habana_frameworks.torch.core as htcore ypred_raw = model(batch.X, batch.mask) + htcore.mark_step() ypred = unpack_predictions(ypred_raw) ytarget = unpack_target(batch.ytarget, model) loss_opt, losses_detached = loss_fn(ytarget, ypred, batch) @@ -136,7 +138,7 @@ def train_epoch( iterator = tqdm.tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch} train loop on rank={rank}") for itrain, batch in iterator: - batch = batch.to(rank, non_blocking=True) + batch = batch.to("hpu", non_blocking=True) with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): loss_opt, loss, _, _, _ = model_step(batch, model, mlpf_loss) @@ -344,14 +346,14 @@ def train_all_epochs( matplotlib.use("agg") # Setup tensorboard writers - if (rank == 0) or (rank == "cpu"): + if (rank == 0) or (rank == "cpu") or (rank == "hpu"): tensorboard_writer_train = SummaryWriter(f"{outdir}/runs/train") tensorboard_writer_valid = SummaryWriter(f"{outdir}/runs/valid") else: tensorboard_writer_train = None tensorboard_writer_valid = None - device_type = "cuda" if isinstance(rank, int) else "cpu" + device_type = "hpu" t0_initial = time.time() # Early stopping setup @@ -581,7 +583,7 @@ def run_test(rank, world_size, config, outdir, model, sample, 
testdir_name, dtyp else: raise Exception("not implemented") - device_type = "cuda" if isinstance(rank, int) else "cpu" + device_type = "hpu" with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): run_predictions( world_size, From b57319dbcad14b50aa812ada245137ee372fd60a Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 20:28:24 -0700 Subject: [PATCH 14/15] num_classes=2 --- mlpf/model/losses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/model/losses.py b/mlpf/model/losses.py index 59a3f2d8a..0efb34957 100644 --- a/mlpf/model/losses.py +++ b/mlpf/model/losses.py @@ -112,7 +112,7 @@ def mlpf_loss(y, ypred, batch): was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( axis=-1 ) - was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( + was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0, num_classes=2).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( axis=-1 ) From 770cacac3890126ffe3216fb2df9afd43f8d5181 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 16 Mar 2025 20:29:36 -0700 Subject: [PATCH 15/15] fix --- mlpf/model/losses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/model/losses.py b/mlpf/model/losses.py index 0efb34957..e57e00831 100644 --- a/mlpf/model/losses.py +++ b/mlpf/model/losses.py @@ -112,7 +112,7 @@ def mlpf_loss(y, ypred, batch): was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( axis=-1 ) - was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0, num_classes=2).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( + was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long), num_classes=2), y["momentum"]], axis=-1) * batch.mask.unsqueeze( axis=-1 )
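Taken together, patches 10/15 through 15/15 above wire Habana Gaudi (HPU) support into the PyTorch training loop: the model and batches are moved to the "hpu" device, habana_frameworks.torch.core.mark_step() is called after the forward pass, the backward pass, and the optimizer step, and device queries go through habana_frameworks.torch.hpu instead of torch.cuda. Below is a minimal, self-contained sketch of that pattern, assuming the habana_frameworks package shipped in the Gaudi PyTorch base image; the toy linear model, MSE loss, and random data are placeholders, not the repository's MLPF classes.

    # Minimal sketch of the HPU training-step pattern applied in patches 10-15.
    # Assumes habana_frameworks from the Gaudi PyTorch image; the linear model,
    # MSE loss, and random data are placeholders, not the repository's MLPF code.
    import torch
    import habana_frameworks.torch.core as htcore   # registers the "hpu" backend
    import habana_frameworks.torch.hpu as hpu

    device = torch.device("hpu")
    model = torch.nn.Linear(16, 4).to(device)        # stand-in for MLPF(**model_kwargs).to("hpu")
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.MSELoss()

    def train_step(x, y):
        optimizer.zero_grad(set_to_none=True)        # the patch sets param.grad = None; this is equivalent
        pred = model(x.to(device))                   # forward pass on the HPU
        htcore.mark_step()                           # flush the lazy-mode graph after the forward pass
        loss = loss_fn(pred, y.to(device))
        loss.backward()
        htcore.mark_step()                           # flush after backward
        optimizer.step()
        htcore.mark_step()                           # flush after the optimizer update
        return loss.item()

    if __name__ == "__main__":
        print(f"HPUs available: {hpu.device_count()} ({hpu.get_device_name(0)})")
        x, y = torch.randn(8, 16), torch.randn(8, 4)
        print("loss:", train_step(x, y))

As in patch 12/15, device discovery and capacity checks go through habana_frameworks.torch.hpu (device_count(), get_device_name()) in place of torch.cuda when running on Gaudi.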