diff --git a/habana/Dockerfile.habana b/habana/Dockerfile.habana
index a3e1aed0f..2a301f23a 100644
--- a/habana/Dockerfile.habana
+++ b/habana/Dockerfile.habana
@@ -1,8 +1,8 @@
-FROM vault.habana.ai/gaudi-docker/1.9.0/ubuntu20.04/habanalabs/tensorflow-installer-tf-cpu-2.11.0:latest
-
+FROM vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
 LABEL maintainer="Javier Duarte "
+RUN apt-get update && \
+    apt-get install -qq -y graphviz graphviz-dev
+
 COPY requirements.txt .
-COPY requirements_nodeps.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-RUN pip install --no-cache-dir -r requirements_nodeps.txt --no-deps
+RUN pip install -r requirements.txt
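Note: the base-image swap above moves the stack from TensorFlow 2.11 to PyTorch 2.6. A quick way to confirm a rebuilt container actually sees the Gaudi devices is a check like the one below. This is a minimal sketch and not part of the PR; `habana_frameworks` ships with the Gaudi base image, and the helper names used here (`is_available`, `device_count`) are from the Habana PyTorch bridge.

```python
# check_hpu.py -- hypothetical sanity check, run inside the rebuilt container
import torch
import habana_frameworks.torch.hpu as hthpu  # provided by the Gaudi base image

print("HPU available:", hthpu.is_available())
print("HPU count:", hthpu.device_count())

# moving a tensor through the device exercises the PyTorch/HPU bridge end to end
x = torch.ones(2, 2).to("hpu")
print((x + x).to("cpu"))
```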
diff --git a/habana/gaudi-pod-python-v19-1hpu.yaml b/habana/gaudi-pod-python-v19-1hpu.yaml
deleted file mode 100644
index aad5f06ce..000000000
--- a/habana/gaudi-pod-python-v19-1hpu.yaml
+++ /dev/null
@@ -1,101 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: mlpf-hpu-strategy-v19-1hpu-hvd-constbatch-bm2
-spec:
-  completions: 1
-  parallelism: 1
-  backoffLimit: 0
-  template:
-    spec:
-      restartPolicy: Never
-      serviceAccountName: jduarte
-      nodeSelector:
-        brightcomputing.com/node-category: "gaudi"
-      hostNetwork: false
-      volumes:
-        - name: home
-          hostPath:
-            path: /home/jduarte
-            type: Directory
-        - name: ceph
-          hostPath:
-            path: /voyager/ceph/users/jduarte
-            type: Directory
-        - name: scratch
-          emptyDir: {}
-      imagePullSecrets:
-        - name: registry-credentials
-      containers:
-        - name: htf2110-190-580-20230327-ubuntu2004
-          image: jmduarte/particleflow:habana_v19
-          imagePullPolicy: Always
-          resources:
-            requests:
-              cpu: 48
-              memory: 384Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 256Gi
-            limits:
-              cpu: 96
-              memory: 396Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 512Gi
-          volumeMounts:
-            - name: home
-              mountPath: /home/jduarte
-            - name: ceph
-              mountPath: /voyager/ceph/users/jduarte
-            - name: scratch
-              mountPath: /scratch
-          env:
-            - name: POD_NAME_ID
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: POD_NODE_HOSTNAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-            - name: HOME
-              value: "/home/jduarte"
-            - name: CEPH
-              value: "/voyager/ceph/users/jduarte"
-            - name: LOCAL_SCRATCH_DIR
-              value: "/scratch"
-            - name: MPI_ROOT
-              value: "/opt/amazon/openmpi"
-            - name: TFDS_DATA_DIR
-              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
-          workingDir: /home/jduarte/particleflow
-          command: ["/bin/bash", "-c"]
-          args:
-            - >-
-              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
-              declare -xir UNIX_TIME="$(date +'%s')";
-
-              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";
-
-              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
-              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})";
-
-              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
-              echo "";
-
-              cat "${K8S_JOB_YAML_FILE}";
-
-              printenv;
-
-              cat /etc/os-release;
-              lscpu;
-              free -h;
-              cat /proc/meminfo;
-              lsblk --output-all;
-              cat /etc/fstab;
-              lspci -vvv;
-              hl-smi;
-              hl-smi -q;
-
-              time -p mpirun -n 1 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
diff --git a/habana/gaudi-pod-python-v19-2hpu.yaml b/habana/gaudi-pod-python-v19-2hpu.yaml
deleted file mode 100644
index 29342c0ca..000000000
--- a/habana/gaudi-pod-python-v19-2hpu.yaml
+++ /dev/null
@@ -1,101 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: mlpf-hpu-strategy-v19-2hpu-constbatch-bm2
-spec:
-  completions: 1
-  parallelism: 1
-  backoffLimit: 0
-  template:
-    spec:
-      restartPolicy: Never
-      serviceAccountName: jduarte
-      nodeSelector:
-        brightcomputing.com/node-category: "gaudi"
-      hostNetwork: false
-      volumes:
-        - name: home
-          hostPath:
-            path: /home/jduarte
-            type: Directory
-        - name: ceph
-          hostPath:
-            path: /voyager/ceph/users/jduarte
-            type: Directory
-        - name: scratch
-          emptyDir: {}
-      imagePullSecrets:
-        - name: registry-credentials
-      containers:
-        - name: htf2110-190-580-20230327-ubuntu2004
-          image: jmduarte/particleflow:habana_v19
-          imagePullPolicy: Always
-          resources:
-            requests:
-              cpu: 48
-              memory: 384Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 256Gi
-            limits:
-              cpu: 96
-              memory: 396Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 512Gi
-          volumeMounts:
-            - name: home
-              mountPath: /home/jduarte
-            - name: ceph
-              mountPath: /voyager/ceph/users/jduarte
-            - name: scratch
-              mountPath: /scratch
-          env:
-            - name: POD_NAME_ID
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: POD_NODE_HOSTNAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-            - name: HOME
-              value: "/home/jduarte"
-            - name: CEPH
-              value: "/voyager/ceph/users/jduarte"
-            - name: LOCAL_SCRATCH_DIR
-              value: "/scratch"
-            - name: MPI_ROOT
-              value: "/opt/amazon/openmpi"
-            - name: TFDS_DATA_DIR
-              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
-          workingDir: /home/jduarte/particleflow
-          command: ["/bin/bash", "-c"]
-          args:
-            - >-
-              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
-              declare -xir UNIX_TIME="$(date +'%s')";
-
-              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";
-
-              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
-              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})";
-
-              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
-              echo "";
-
-              cat "${K8S_JOB_YAML_FILE}";
-
-              printenv;
-
-              cat /etc/os-release;
-              lscpu;
-              free -h;
-              cat /proc/meminfo;
-              lsblk --output-all;
-              cat /etc/fstab;
-              lspci -vvv;
-              hl-smi;
-              hl-smi -q;
-
-              time -p mpirun -n 2 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
diff --git a/habana/gaudi-pod-python-v19-3hpu.yaml b/habana/gaudi-pod-python-v19-3hpu.yaml
deleted file mode 100644
index fc16270df..000000000
--- a/habana/gaudi-pod-python-v19-3hpu.yaml
+++ /dev/null
@@ -1,101 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: mlpf-hpu-strategy-v19-3hpu-constbatch-bm2
-spec:
-  completions: 1
-  parallelism: 1
-  backoffLimit: 0
-  template:
-    spec:
-      restartPolicy: Never
-      serviceAccountName: jduarte
-      nodeSelector:
-        brightcomputing.com/node-category: "gaudi"
-      hostNetwork: false
-      volumes:
-        - name: home
-          hostPath:
-            path: /home/jduarte
-            type: Directory
-        - name: ceph
-          hostPath:
-            path: /voyager/ceph/users/jduarte
-            type: Directory
-        - name: scratch
-          emptyDir: {}
-      imagePullSecrets:
-        - name: registry-credentials
-      containers:
-        - name: htf2110-190-580-20230327-ubuntu2004
-          image: jmduarte/particleflow:habana_v19
-          imagePullPolicy: Always
-          resources:
-            requests:
-              cpu: 48
-              memory: 384Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 256Gi
-            limits:
-              cpu: 96
-              memory: 396Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 512Gi
-          volumeMounts:
-            - name: home
-              mountPath: /home/jduarte
-            - name: ceph
-              mountPath: /voyager/ceph/users/jduarte
-            - name: scratch
-              mountPath: /scratch
-          env:
-            - name: POD_NAME_ID
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: POD_NODE_HOSTNAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-            - name: HOME
-              value: "/home/jduarte"
-            - name: CEPH
-              value: "/voyager/ceph/users/jduarte"
-            - name: LOCAL_SCRATCH_DIR
-              value: "/scratch"
-            - name: MPI_ROOT
-              value: "/opt/amazon/openmpi"
-            - name: TFDS_DATA_DIR
-              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
-          workingDir: /home/jduarte/particleflow
-          command: ["/bin/bash", "-c"]
-          args:
-            - >-
-              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
-              declare -xir UNIX_TIME="$(date +'%s')";
-
-              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";
-
-              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
-              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})";
-
-              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
-              echo "";
-
-              cat "${K8S_JOB_YAML_FILE}";
-
-              printenv;
-
-              cat /etc/os-release;
-              lscpu;
-              free -h;
-              cat /proc/meminfo;
-              lsblk --output-all;
-              cat /etc/fstab;
-              lspci -vvv;
-              hl-smi;
-              hl-smi -q;
-
-              time -p mpirun -n 3 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
diff --git a/habana/gaudi-pod-python-v19-4hpu.yaml b/habana/gaudi-pod-python-v19-4hpu.yaml
deleted file mode 100644
index 6bccd1d46..000000000
--- a/habana/gaudi-pod-python-v19-4hpu.yaml
+++ /dev/null
@@ -1,101 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: mlpf-hpu-strategy-v19-4hpu-constbatch-bm2
-spec:
-  completions: 1
-  parallelism: 1
-  backoffLimit: 0
-  template:
-    spec:
-      restartPolicy: Never
-      serviceAccountName: jduarte
-      nodeSelector:
-        brightcomputing.com/node-category: "gaudi"
-      hostNetwork: false
-      volumes:
-        - name: home
-          hostPath:
-            path: /home/jduarte
-            type: Directory
-        - name: ceph
-          hostPath:
-            path: /voyager/ceph/users/jduarte
-            type: Directory
-        - name: scratch
-          emptyDir: {}
-      imagePullSecrets:
-        - name: registry-credentials
-      containers:
-        - name: htf2110-190-580-20230327-ubuntu2004
-          image: jmduarte/particleflow:habana_v19
-          imagePullPolicy: Always
-          resources:
-            requests:
-              cpu: 48
-              memory: 384Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 256Gi
-            limits:
-              cpu: 96
-              memory: 396Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 512Gi
-          volumeMounts:
-            - name: home
-              mountPath: /home/jduarte
-            - name: ceph
-              mountPath: /voyager/ceph/users/jduarte
-            - name: scratch
-              mountPath: /scratch
-          env:
-            - name: POD_NAME_ID
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: POD_NODE_HOSTNAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-            - name: HOME
-              value: "/home/jduarte"
-            - name: CEPH
-              value: "/voyager/ceph/users/jduarte"
-            - name: LOCAL_SCRATCH_DIR
-              value: "/scratch"
-            - name: MPI_ROOT
-              value: "/opt/amazon/openmpi"
-            - name: TFDS_DATA_DIR
-              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
-          workingDir: /home/jduarte/particleflow
-          command: ["/bin/bash", "-c"]
-          args:
-            - >-
-              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
-              declare -xir UNIX_TIME="$(date +'%s')";
-
-              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";
-
-              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
-              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})";
-
-              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
-              echo "";
-
-              cat "${K8S_JOB_YAML_FILE}";
-
-              printenv;
-
-              cat /etc/os-release;
-              lscpu;
-              free -h;
-              cat /proc/meminfo;
-              lsblk --output-all;
-              cat /etc/fstab;
-              lspci -vvv;
-              hl-smi;
-              hl-smi -q;
-
-              time -p mpirun -n 4 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
diff --git a/habana/gaudi-pod-python-v19-5hpu.yaml b/habana/gaudi-pod-python-v19-5hpu.yaml
deleted file mode 100644
index cb40d37ad..000000000
--- a/habana/gaudi-pod-python-v19-5hpu.yaml
+++ /dev/null
@@ -1,101 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: mlpf-hpu-strategy-v19-5hpu-constbatch-bm2
-spec:
-  completions: 1
-  parallelism: 1
-  backoffLimit: 0
-  template:
-    spec:
-      restartPolicy: Never
-      serviceAccountName: jduarte
-      nodeSelector:
-        brightcomputing.com/node-category: "gaudi"
-      hostNetwork: false
-      volumes:
-        - name: home
-          hostPath:
-            path: /home/jduarte
-            type: Directory
-        - name: ceph
-          hostPath:
-            path: /voyager/ceph/users/jduarte
-            type: Directory
-        - name: scratch
-          emptyDir: {}
-      imagePullSecrets:
-        - name: registry-credentials
-      containers:
-        - name: htf2110-190-580-20230327-ubuntu2004
-          image: jmduarte/particleflow:habana_v19
-          imagePullPolicy: Always
-          resources:
-            requests:
-              cpu: 48
-              memory: 384Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 256Gi
-            limits:
-              cpu: 96
-              memory: 396Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 512Gi
-          volumeMounts:
-            - name: home
-              mountPath: /home/jduarte
-            - name: ceph
-              mountPath: /voyager/ceph/users/jduarte
-            - name: scratch
-              mountPath: /scratch
-          env:
-            - name: POD_NAME_ID
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: POD_NODE_HOSTNAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-            - name: HOME
-              value: "/home/jduarte"
-            - name: CEPH
-              value: "/voyager/ceph/users/jduarte"
-            - name: LOCAL_SCRATCH_DIR
-              value: "/scratch"
-            - name: MPI_ROOT
-              value: "/opt/amazon/openmpi"
-            - name: TFDS_DATA_DIR
-              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
-          workingDir: /home/jduarte/particleflow
-          command: ["/bin/bash", "-c"]
-          args:
-            - >-
-              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
-              declare -xir UNIX_TIME="$(date +'%s')";
-
-              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";
-
-              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
-              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})";
-
-              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
-              echo "";
-
-              cat "${K8S_JOB_YAML_FILE}";
-
-              printenv;
-
-              cat /etc/os-release;
-              lscpu;
-              free -h;
-              cat /proc/meminfo;
-              lsblk --output-all;
-              cat /etc/fstab;
-              lspci -vvv;
-              hl-smi;
-              hl-smi -q;
-
-              time -p mpirun -n 5 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
diff --git a/habana/gaudi-pod-python-v19-6hpu.yaml b/habana/gaudi-pod-python-v19-6hpu.yaml
deleted file mode 100644
index baf879982..000000000
--- a/habana/gaudi-pod-python-v19-6hpu.yaml
+++ /dev/null
@@ -1,101 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: mlpf-hpu-strategy-v19-6hpu-constbatch-bm2
-spec:
-  completions: 1
-  parallelism: 1
-  backoffLimit: 0
-  template:
-    spec:
-      restartPolicy: Never
-      serviceAccountName: jduarte
-      nodeSelector:
-        brightcomputing.com/node-category: "gaudi"
-      hostNetwork: false
-      volumes:
-        - name: home
-          hostPath:
-            path: /home/jduarte
-            type: Directory
-        - name: ceph
-          hostPath:
-            path: /voyager/ceph/users/jduarte
-            type: Directory
-        - name: scratch
-          emptyDir: {}
-      imagePullSecrets:
-        - name: registry-credentials
-      containers:
-        - name: htf2110-190-580-20230327-ubuntu2004
-          image: jmduarte/particleflow:habana_v19
-          imagePullPolicy: Always
-          resources:
-            requests:
-              cpu: 48
-              memory: 384Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 256Gi
-            limits:
-              cpu: 96
-              memory: 396Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 512Gi
-          volumeMounts:
-            - name: home
-              mountPath: /home/jduarte
-            - name: ceph
-              mountPath: /voyager/ceph/users/jduarte
-            - name: scratch
-              mountPath: /scratch
-          env:
-            - name: POD_NAME_ID
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: POD_NODE_HOSTNAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-            - name: HOME
-              value: "/home/jduarte"
-            - name: CEPH
-              value: "/voyager/ceph/users/jduarte"
-            - name: LOCAL_SCRATCH_DIR
-              value: "/scratch"
-            - name: MPI_ROOT
-              value: "/opt/amazon/openmpi"
-            - name: TFDS_DATA_DIR
-              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
-          workingDir: /home/jduarte/particleflow
-          command: ["/bin/bash", "-c"]
-          args:
-            - >-
-              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
-              declare -xir UNIX_TIME="$(date +'%s')";
-
-              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";
-
-              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
-              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})";
-
-              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
-              echo "";
-
-              cat "${K8S_JOB_YAML_FILE}";
-
-              printenv;
-
-              cat /etc/os-release;
-              lscpu;
-              free -h;
-              cat /proc/meminfo;
-              lsblk --output-all;
-              cat /etc/fstab;
-              lspci -vvv;
-              hl-smi;
-              hl-smi -q;
-
-              time -p mpirun -n 6 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
diff --git a/habana/gaudi-pod-python-v19-7hpu.yaml b/habana/gaudi-pod-python-v19-7hpu.yaml
deleted file mode 100644
index e716f8069..000000000
--- a/habana/gaudi-pod-python-v19-7hpu.yaml
+++ /dev/null
@@ -1,101 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: mlpf-hpu-strategy-v19-7hpu-constbatch-bm2
-spec:
-  completions: 1
-  parallelism: 1
-  backoffLimit: 0
-  template:
-    spec:
-      restartPolicy: Never
-      serviceAccountName: jduarte
-      nodeSelector:
-        brightcomputing.com/node-category: "gaudi"
-      hostNetwork: false
-      volumes:
-        - name: home
-          hostPath:
-            path: /home/jduarte
-            type: Directory
-        - name: ceph
-          hostPath:
-            path: /voyager/ceph/users/jduarte
-            type: Directory
-        - name: scratch
-          emptyDir: {}
-      imagePullSecrets:
-        - name: registry-credentials
-      containers:
-        - name: htf2110-190-580-20230327-ubuntu2004
-          image: jmduarte/particleflow:habana_v19
-          imagePullPolicy: Always
-          resources:
-            requests:
-              cpu: 48
-              memory: 384Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 256Gi
-            limits:
-              cpu: 96
-              memory: 396Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 512Gi
-          volumeMounts:
-            - name: home
-              mountPath: /home/jduarte
-            - name: ceph
-              mountPath: /voyager/ceph/users/jduarte
-            - name: scratch
-              mountPath: /scratch
-          env:
-            - name: POD_NAME_ID
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: POD_NODE_HOSTNAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-            - name: HOME
-              value: "/home/jduarte"
-            - name: CEPH
-              value: "/voyager/ceph/users/jduarte"
-            - name: LOCAL_SCRATCH_DIR
-              value: "/scratch"
-            - name: MPI_ROOT
-              value: "/opt/amazon/openmpi"
-            - name: TFDS_DATA_DIR
-              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
-          workingDir: /home/jduarte/particleflow
-          command: ["/bin/bash", "-c"]
-          args:
-            - >-
-              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
-              declare -xir UNIX_TIME="$(date +'%s')";
-
-              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";
-
-              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
-              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})";
-
-              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
-              echo "";
-
-              cat "${K8S_JOB_YAML_FILE}";
-
-              printenv;
-
-              cat /etc/os-release;
-              lscpu;
-              free -h;
-              cat /proc/meminfo;
-              lsblk --output-all;
-              cat /etc/fstab;
-              lspci -vvv;
-              hl-smi;
-              hl-smi -q;
-
-              time -p mpirun -n 7 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
diff --git a/habana/gaudi-pod-python-v19-8hpu.yaml b/habana/gaudi-pod-python-v19-8hpu.yaml
deleted file mode 100644
index 762229a96..000000000
--- a/habana/gaudi-pod-python-v19-8hpu.yaml
+++ /dev/null
@@ -1,101 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: mlpf-hpu-strategy-v19-8hpu-constbatch-bm2
-spec:
-  completions: 1
-  parallelism: 1
-  backoffLimit: 0
-  template:
-    spec:
-      restartPolicy: Never
-      serviceAccountName: jduarte
-      nodeSelector:
-        brightcomputing.com/node-category: "gaudi"
-      hostNetwork: false
-      volumes:
-        - name: home
-          hostPath:
-            path: /home/jduarte
-            type: Directory
-        - name: ceph
-          hostPath:
-            path: /voyager/ceph/users/jduarte
-            type: Directory
-        - name: scratch
-          emptyDir: {}
-      imagePullSecrets:
-        - name: registry-credentials
-      containers:
-        - name: htf2110-190-580-20230327-ubuntu2004
-          image: jmduarte/particleflow:habana_v19
-          imagePullPolicy: Always
-          resources:
-            requests:
-              cpu: 48
-              memory: 384Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 256Gi
-            limits:
-              cpu: 96
-              memory: 396Gi
-              habana.ai/gaudi: 8
-              hugepages-2Mi: 96000Mi
-              ephemeral-storage: 512Gi
-          volumeMounts:
-            - name: home
-              mountPath: /home/jduarte
-            - name: ceph
-              mountPath: /voyager/ceph/users/jduarte
-            - name: scratch
-              mountPath: /scratch
-          env:
-            - name: POD_NAME_ID
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: POD_NODE_HOSTNAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-            - name: HOME
-              value: "/home/jduarte"
-            - name: CEPH
-              value: "/voyager/ceph/users/jduarte"
-            - name: LOCAL_SCRATCH_DIR
-              value: "/scratch"
-            - name: MPI_ROOT
-              value: "/opt/amazon/openmpi"
-            - name: TFDS_DATA_DIR
-              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
-          workingDir: /home/jduarte/particleflow
-          command: ["/bin/bash", "-c"]
-          args:
-            - >-
-              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
-              declare -xir UNIX_TIME="$(date +'%s')";
-
-              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";
-
-              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
-              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum ${K8S_JOB_YAML_FILE})";
-
-              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
-              echo "";
-
-              cat "${K8S_JOB_YAML_FILE}";
-
-              printenv;
-
-              cat /etc/os-release;
-              lscpu;
-              free -h;
-              cat /proc/meminfo;
-              lsblk --output-all;
-              cat /etc/fstab;
-              lspci -vvv;
-              hl-smi;
-              hl-smi -q;
-
-              time -p mpirun -n 8 --allow-run-as-root --prefix "${MPI_ROOT}" -x "${VGR_POD_ID}" python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
diff --git a/habana/requirements.txt b/habana/requirements.txt
index 2fc59e900..27db3e805 100644
--- a/habana/requirements.txt
+++ b/habana/requirements.txt
@@ -1,47 +1,32 @@
 array-record
 autopep8
 awkward
-bayesian-optimization
 boost_histogram
 click
 comet-ml
-dill
 fastjet
-fsspec
-future
-gviz-api
+jupyter
+jupyter-book
 matplotlib
-mpi4py
+mlcroissant
 mplhep
-networkx
-nevergrad
 notebook
-numba
+numba==0.60.0
 onnx
 onnxruntime
-pandas
 papermill
 plotly
 pre-commit
-promise
 protobuf
 pyarrow
-ray[default]==1.6.0
-ray[tune]==1.6.0
+ray[tune]
 scikit-learn
 scikit-optimize
 scipy
 seaborn
 setGPU
-tensorboard_plugin_profile
-tensorflow-addons
-tensorflow-datasets==4.9.1
-tensorflow-estimator
-tensorflow-hub
-tensorflow-metadata
-tensorflow-probability
-tqdm
-typeguard
+tensorflow-cpu==2.11.1
+tensorflow-datasets
 uproot
 vector
 zenodo_get
diff --git a/habana/requirements_nodeps.txt b/habana/requirements_nodeps.txt
deleted file mode 100644
index 4f6db4fbd..000000000
--- a/habana/requirements_nodeps.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-git+https://github.com/jpata/hep_tfds.git@31baf14defc53dcd1d7555e4a3945083e45e9304
-keras-tuner
-kt-legacy
-tensorflow-text
-tf-models-official
-tf2onnx
diff --git a/mlpf/model/inference.py b/mlpf/model/inference.py
index 890d42364..1d39b6314 100644
--- a/mlpf/model/inference.py
+++ b/mlpf/model/inference.py
@@ -34,7 +34,7 @@


 def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_match_dr, outpath, dir_name, sample):
-
+    import habana_frameworks.torch.core as htcore
     # skip prediction if output exists
     outfile = f"{outpath}/preds{dir_name}/{sample}/pred_{rank}_{i}.parquet"
     if os.path.isfile(outfile):
@@ -43,6 +43,7 @@ def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_m
     # run model on batch
     batch = batch.to(rank)
     ypred = model(batch.X, batch.mask)
+    htcore.mark_step()

     # convert all outputs to float32 in case running in float16 or bfloat16
     ypred = tuple([y.to(torch.float32) for y in ypred])
diff --git a/mlpf/model/losses.py b/mlpf/model/losses.py
index 59a3f2d8a..e57e00831 100644
--- a/mlpf/model/losses.py
+++ b/mlpf/model/losses.py
@@ -112,7 +112,7 @@ def mlpf_loss(y, ypred, batch):
     was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze(
         axis=-1
     )
-    was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze(
+    was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long), num_classes=2), y["momentum"]], axis=-1) * batch.mask.unsqueeze(
         axis=-1
     )

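Note on the `losses.py` change above: `torch.nn.functional.one_hot` infers the number of classes from the largest value in its input when `num_classes` is not given, so a batch in which every element is background (`cls_id == 0`) would produce a tensor of width 1 instead of 2 and break the concatenation. A standalone illustration of that failure mode:

```python
import torch
import torch.nn.functional as F

is_particle = torch.tensor([0, 0, 0])  # a batch that happens to be all background

# width is inferred as max(input) + 1 = 1, not the 2 classes the loss expects
print(F.one_hot(is_particle).shape)                 # torch.Size([3, 1])

# pinning num_classes keeps the shape stable regardless of batch content
print(F.one_hot(is_particle, num_classes=2).shape)  # torch.Size([3, 2])
```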
diff --git a/mlpf/model/training.py b/mlpf/model/training.py
index b1ce90a55..b80f243e0 100644
--- a/mlpf/model/training.py
+++ b/mlpf/model/training.py
@@ -67,7 +67,9 @@ def configure_model_trainable(model: MLPF, trainable: Union[str, List[str]], is_


 def model_step(batch, model, loss_fn):
+    import habana_frameworks.torch.core as htcore
     ypred_raw = model(batch.X, batch.mask)
+    htcore.mark_step()
     ypred = unpack_predictions(ypred_raw)
     ytarget = unpack_target(batch.ytarget, model)
     loss_opt, losses_detached = loss_fn(ytarget, ypred, batch)
@@ -75,13 +77,16 @@


 def optimizer_step(model, loss_opt, optimizer, lr_schedule, scaler):
+    import habana_frameworks.torch.core as htcore
     # Clear gradients
     for param in model.parameters():
         param.grad = None

     # Backward pass and optimization
     scaler.scale(loss_opt).backward()
+    htcore.mark_step()
     scaler.step(optimizer)
+    htcore.mark_step()
     scaler.update()
     if lr_schedule:
         lr_schedule.step()
@@ -133,7 +138,7 @@ def train_epoch(
     iterator = tqdm.tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch} train loop on rank={rank}")

     for itrain, batch in iterator:
-        batch = batch.to(rank, non_blocking=True)
+        batch = batch.to("hpu", non_blocking=True)

         with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"):
             loss_opt, loss, _, _, _ = model_step(batch, model, mlpf_loss)
@@ -341,14 +346,14 @@ def train_all_epochs(
     matplotlib.use("agg")

     # Setup tensorboard writers
-    if (rank == 0) or (rank == "cpu"):
+    if (rank == 0) or (rank == "cpu") or (rank == "hpu"):
         tensorboard_writer_train = SummaryWriter(f"{outdir}/runs/train")
         tensorboard_writer_valid = SummaryWriter(f"{outdir}/runs/valid")
     else:
         tensorboard_writer_train = None
         tensorboard_writer_valid = None

-    device_type = "cuda" if isinstance(rank, int) else "cpu"
+    device_type = "hpu"
     t0_initial = time.time()

     # Early stopping setup
@@ -578,7 +583,7 @@ def run_test(rank, world_size, config, outdir, model, sample, testdir_name, dtyp
     else:
         raise Exception("not implemented")

-    device_type = "cuda" if isinstance(rank, int) else "cpu"
+    device_type = "hpu"
     with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"):
         run_predictions(
             world_size,
@@ -630,7 +635,7 @@ def run(rank, world_size, config, outdir, logfile):

     # load a pre-trained checkpoint (continue an aborted training or fine-tune)
     if config["load"]:
-        model = MLPF(**model_kwargs).to(torch.device(rank))
+        model = MLPF(**model_kwargs).to(torch.device("hpu"))
         optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"])

         checkpoint = torch.load(config["load"], map_location=torch.device(rank))
@@ -675,7 +680,7 @@ def run(rank, world_size, config, outdir, logfile):
         model = MLPF(**model_kwargs)
         optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"])

-    model.to(rank)
+    model.to(torch.device("hpu"))
     configure_model_trainable(model, config["model"]["trainable"], True)

     if world_size > 1:
@@ -815,7 +820,7 @@


 # Run either on CPU, single GPU or multi-GPU using pytorch
-def device_agnostic_run(config, world_size, outdir):
+def device_agnostic_run(config, world_size, outdir, habana=False):
    if config["train"]:
        logfile = f"{outdir}/train.log"
    else:
@@ -823,15 +828,20 @@ def override_config(config: dict, args):
     _configLogger("mlpf", filename=logfile)

     if config["gpus"]:
+        if habana:
+            import habana_frameworks.torch.hpu as torch_device
+        else:
+            import torch.cuda as torch_device
         assert (
-            world_size <= torch.cuda.device_count()
-        ), f"--gpus is too high (specified {world_size} gpus but only {torch.cuda.device_count()} gpus are available)"
+            world_size <= torch_device.device_count()
+        ), f"--gpus is too high (specified {world_size} gpus but only {torch_device.device_count()} gpus are available)"

-        torch.cuda.empty_cache()
+        if not habana:
+            torch.cuda.empty_cache()
         if world_size > 1:
             _logger.info(f"Will use torch.nn.parallel.DistributedDataParallel() and {world_size} gpus", color="purple")
             for rank in range(world_size):
-                _logger.info(torch.cuda.get_device_name(rank), color="purple")
+                _logger.info(torch_device.get_device_name(rank), color="purple")

             mp.spawn(
                 run,
@@ -841,7 +851,7 @@
         )
     elif world_size == 1:
         rank = 0
-        _logger.info(f"Will use single-gpu: {torch.cuda.get_device_name(rank)}", color="purple")
+        _logger.info(f"Will use single-gpu: {torch_device.get_device_name(rank)}", color="purple")
         run(rank, world_size, config, outdir, logfile)

     else:
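Note on the `htcore.mark_step()` calls added above: they follow the Gaudi lazy-execution model, where ops are accumulated into a graph and only launched when a step boundary is marked, so the canonical pattern is to mark after the forward pass, after `backward()`, and after `optimizer.step()`. A minimal sketch of that pattern, assuming an HPU-enabled environment (the model, batch, and loss function here are placeholders, not this repo's API):

```python
import torch
import habana_frameworks.torch.core as htcore  # requires a Gaudi/SynapseAI install

def hpu_train_step(model, x, y, loss_fn, optimizer):
    # forward: in lazy mode, ops are only recorded until a step is marked
    pred = model(x.to("hpu"))
    loss = loss_fn(pred, y.to("hpu"))

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    htcore.mark_step()  # flush the accumulated backward graph

    optimizer.step()
    htcore.mark_step()  # flush the optimizer update
    return loss.detach()
```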
diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py
index 70a869da6..5f0244bcc 100644
--- a/mlpf/pipeline.py
+++ b/mlpf/pipeline.py
@@ -90,15 +90,18 @@
 parser.add_argument("--ray-gpus", type=int, default=None, help="GPUs for ray-train")
 parser.add_argument("--raytune-num-samples", type=int, default=None, help="Number of samples to draw from search space")
+# option for habana training
+parser.add_argument("--habana", action="store_true", default=None, help="use Habana Gaudi device for training")


 def main():
     # https://github.com/pytorch/pytorch/issues/11201#issuecomment-895047235
     import torch
-
     torch.multiprocessing.set_sharing_strategy(SHARING_STRATEGY)
     # plt.rcParams['text.usetex'] = True
     args = parser.parse_args()
+    if args.habana:
+        import habana_frameworks.torch.core as htcore

     logging.basicConfig(level=logging.INFO)

     world_size = args.gpus if args.gpus > 0 else 1  # will be 1 for both cpu (args.gpu < 1) and single-gpu (1)
@@ -150,7 +153,7 @@ def main():
         if args.ray_train:
             run_ray_training(config, args, experiment_dir)
         else:
-            device_agnostic_run(config, world_size, experiment_dir)
+            device_agnostic_run(config, world_size, experiment_dir, args.habana)


 if __name__ == "__main__":
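With `--habana` wired through to `device_agnostic_run`, the same `device_count()` / `get_device_name()` calls work against either backend because `torch.cuda` and `habana_frameworks.torch.hpu` expose matching helpers. A condensed sketch of that dispatch (the function name here is hypothetical, mirroring the logic in the diff); an invocation would then presumably be the existing GPU command line with `--habana` appended, e.g. `python3 mlpf/pipeline.py ... --gpus 1 --habana`:

```python
def get_device_module(habana: bool = False):
    """Return a module exposing device_count()/get_device_name() for the chosen backend."""
    if habana:
        import habana_frameworks.torch.hpu as torch_device  # Gaudi backend
    else:
        import torch.cuda as torch_device  # default CUDA backend
    return torch_device

# e.g. guard a requested world size against the hardware actually present
devmod = get_device_module(habana=True)
assert devmod.device_count() >= 1, "no devices visible to the chosen backend"
```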