Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
ac1780c
ci for testing gpu metrics in eks
garvit3835 Jun 1, 2025
11cee4f
ci for testing gpu metrics in eks
garvit3835 Jun 1, 2025
0d4c5c9
ci for testing gpu metrics in eks
garvit3835 Jun 1, 2025
d67f0a5
ci for testing gpu metrics in eks
garvit3835 Jun 1, 2025
bd128a5
ci for testing gpu metrics in eks
garvit3835 Jun 1, 2025
2cc613d
ci for testing gpu metrics in eks
garvit3835 Jun 1, 2025
d060fef
ci for testing gpu metrics in eks
garvit3835 Jun 1, 2025
921d752
ci for testing gpu metrics in eks
garvit3835 Jun 1, 2025
cc7157e
update in gpu test ci
garvit3835 Jun 3, 2025
f06cebf
matrix in ci for devzero and nvidia dcgm
garvit3835 Jun 3, 2025
db37092
matrix in ci for devzero and nvidia dcgm
garvit3835 Jun 3, 2025
d211df0
matrix in ci for devzero and nvidia dcgm
garvit3835 Jun 3, 2025
8c4bed0
using makefile to install zxporter in ci
garvit3835 Jun 3, 2025
9aeba33
fix in aws gpu test ci
garvit3835 Jun 3, 2025
a1b41d2
fix in aws gpu test ci
garvit3835 Jun 3, 2025
e959f97
Merge pull request #106 from devzero-inc/main
garvit3835 Jun 3, 2025
db0605c
update in gpu test ci
garvit3835 Jun 3, 2025
63b20a3
Merge branch 'garvit/aws-gpu-test' of https://github.com/devzero-inc/…
garvit3835 Jun 3, 2025
c6e19b9
using makefile to install zxporter in ci
garvit3835 Jun 3, 2025
44113e8
fixes in aws-gpu-test ci
garvit3835 Jun 6, 2025
bcdc404
fix in aws gpu test ci
garvit3835 Jun 6, 2025
432da67
fix aws-gpu-test ci
garvit3835 Jun 6, 2025
0ed7300
fix aws-gpu-test ci
garvit3835 Jun 7, 2025
e87559f
Added nvidia-device-plugin in AWS GPU test CI
garvit3835 Jun 7, 2025
eceac96
Added nvidia-device-plugin in AWS GPU test CI
garvit3835 Jun 7, 2025
a921e2f
Added nvidia-device-plugin in AWS GPU test CI
garvit3835 Jun 7, 2025
ca3ed87
Added nvidia-device-plugin in AWS GPU test CI
garvit3835 Jun 7, 2025
9696cd7
test karpenter in aws gpu test CI
garvit3835 Jun 9, 2025
98fa130
test karpenter in aws gpu test CI
garvit3835 Jun 9, 2025
6cbc66b
test karpenter in aws gpu test CI
garvit3835 Jun 9, 2025
2b195c8
test karpenter in aws gpu test CI
garvit3835 Jun 9, 2025
c6fc269
test karpenter in aws gpu test CI
garvit3835 Jun 9, 2025
270e04d
test karpenter in aws gpu test CI
garvit3835 Jun 9, 2025
f3844bb
test karpenter in aws gpu test CI
garvit3835 Jun 9, 2025
5a19315
alternate ci for karpenter with cloudformation
garvit3835 Jun 10, 2025
66da291
alternate ci for karpenter with cloudformation
garvit3835 Jun 10, 2025
d790f92
alternate ci for karpenter with cloudformation
garvit3835 Jun 10, 2025
6d70a34
karpenter in aws gpu test ci
garvit3835 Jun 10, 2025
3ffa2cf
karpenter in aws gpu test ci
garvit3835 Jun 10, 2025
975c0e3
karpenter in aws gpu test ci
garvit3835 Jun 10, 2025
f74d8ac
karpenter in aws gpu test ci
garvit3835 Jun 10, 2025
0c998dc
karpenter in aws gpu test ci
garvit3835 Jun 10, 2025
d5e4f90
karpenter in aws gpu test ci
garvit3835 Jun 10, 2025
4f8cb5a
karpenter in aws gpu test ci
garvit3835 Jun 10, 2025
5b19b52
karpenter in aws gpu test ci
garvit3835 Jun 10, 2025
c035dea
karpenter in aws gpu test ci
garvit3835 Jun 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
498 changes: 498 additions & 0 deletions .github/workflows/aws-gpu-test.yaml

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,8 @@ config/**/charts
*.swp
*.swo
*~

# Terraform files
*.tfstate
*.tfstate.backup
.terraform*
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,11 @@ help: ## Display this help.

# NOTE: do not append -w to the controller-gen invocation below. -w is the
# short form of --which-markers, which only prints the markers supported by the
# requested generators and returns without writing any RBAC/CRD/webhook files.
.PHONY: manifests
manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
	$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases

# NOTE: do not append -w to the controller-gen invocation below. -w is the
# short form of --which-markers, which only prints the markers supported by the
# requested generators and returns without writing the DeepCopy code.
.PHONY: generate
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
	$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..."

.PHONY: fmt
fmt: ## Run go fmt against code.
Expand Down
811 changes: 458 additions & 353 deletions config/prometheus/hack.prometheus.values.yaml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1229,4 +1229,4 @@ spec:
volumes:
- configMap:
name: devzero-zxporter-env-config
name: config-volume
name: config-volume
84 changes: 84 additions & 0 deletions nvidia-device-plugin-prereq/container-toolkit.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
---
# DaemonSet that runs a privileged installer pod on every node labelled
# nvidia.com/gpu.present=true. The pod adds the NVIDIA container-toolkit yum
# repository, installs nvidia-container-toolkit, runs `nvidia-ctk` to configure
# containerd, attempts a containerd restart, and then stays alive.
#
# NOTE(review): the host root is mounted at /host, but the script never
# chroots or nsenters into it. As written, yum appears to install the toolkit
# into this container's own filesystem; only /etc/containerd is shared with the
# node through the containerd-config mount. Confirm the toolkit binaries
# actually reach the host.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-toolkit-installer
  namespace: nvidia-device-plugin
spec:
  selector:
    matchLabels:
      name: nvidia-toolkit-installer
  template:
    metadata:
      labels:
        name: nvidia-toolkit-installer
    spec:
      # Only schedule onto nodes that advertise a GPU.
      nodeSelector:
        nvidia.com/gpu.present: "true"
      hostPID: true
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
        - key: CriticalAddonsOnly
          operator: Exists
        - effect: NoSchedule
          key: node-role.kubernetes.io/control-plane
        - effect: NoSchedule
          key: node-role.kubernetes.io/master
      containers:
        - name: install-nvidia-toolkit
          image: amazonlinux:2023
          securityContext:
            privileged: true
          command:
            - /bin/bash
            - -c
            - |
              set -ex

              # Add NVIDIA repo. --fail makes curl exit non-zero on an HTTP
              # error instead of saving the error page as the .repo file.
              curl --fail -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
                -o /etc/yum.repos.d/nvidia-container-toolkit.repo

              # Install toolkit
              yum install -y nvidia-container-toolkit

              # Configure containerd
              nvidia-ctk runtime configure --runtime=containerd

              # Restart containerd (best effort, but report a failure instead
              # of hiding it so a stale runtime config is visible in the logs).
              systemctl restart containerd \
                || echo "WARNING: could not restart containerd; the new runtime config may not be active." >&2

              # Keep the container running: the DaemonSet restarts exited
              # containers, which would re-run the whole install.
              echo "NVIDIA container toolkit installed and configured."
              sleep infinity
          volumeMounts:
            - name: root
              mountPath: /host
              mountPropagation: Bidirectional
            # Shares the node's containerd configuration directory so that
            # `nvidia-ctk runtime configure` edits the host's files.
            - name: containerd-config
              mountPath: /etc/containerd
            # NOTE(review): presumably exposed so systemctl can reach the
            # host's systemd; confirm systemctl exists in amazonlinux:2023.
            - name: systemd
              mountPath: /run/systemd
            - name: modules
              mountPath: /lib/modules
              readOnly: true
            - name: dev
              mountPath: /dev
      volumes:
        - name: root
          hostPath:
            path: /
        - name: containerd-config
          hostPath:
            path: /etc/containerd
        - name: systemd
          hostPath:
            path: /run/systemd
        - name: modules
          hostPath:
            path: /lib/modules
        - name: dev
          hostPath:
            path: /dev
      restartPolicy: Always
81 changes: 81 additions & 0 deletions nvidia-device-plugin-prereq/driver-installer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
---
# DaemonSet that runs on every node labelled nvidia.com/gpu.present=true.
# An init container prepares /usr/local/nvidia on the host, then the
# driver-installer container runs the NVIDIA k8s-driver-manager image.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-driver-installer
  namespace: nvidia-device-plugin
spec:
  selector:
    matchLabels:
      name: nvidia-driver-installer
  template:
    metadata:
      labels:
        name: nvidia-driver-installer
    spec:
      hostPID: true
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
        - key: CriticalAddonsOnly
          operator: Exists
        - key: node-role.kubernetes.io/control-plane
          effect: NoSchedule
        - key: node-role.kubernetes.io/master
          effect: NoSchedule
      # Only schedule onto nodes that advertise a GPU.
      nodeSelector:
        nvidia.com/gpu.present: "true"
      # fix-dcgm-dir is a run-once task, so it is an init container. As a
      # regular container it would exit and be restarted indefinitely (a
      # DaemonSet pod always restarts its containers), leaving the pod in
      # crash-loop back-off and never Ready.
      initContainers:
        - name: fix-dcgm-dir
          image: amazonlinux:2023
          securityContext:
            privileged: true
          command: ["/bin/bash", "-c"]
          args:
            - |
              set -ex
              TARGET_DIR="/host/usr/local/nvidia"
              # If it doesn't exist, symlink something useful
              if [ ! -d "$TARGET_DIR" ]; then
                mkdir -p /host/usr/local
                ln -s /usr/lib64 "$TARGET_DIR"
              fi
              echo "/usr/local/nvidia set up for DCGM."
          volumeMounts:
            - name: root
              mountPath: /host
              mountPropagation: Bidirectional
            # NOTE(review): this volume is a hostPath with DirectoryOrCreate,
            # so the directory probably already exists when the script runs
            # and the symlink branch above may never execute; confirm intent.
            - name: nvidia-local
              mountPath: /host/usr/local/nvidia
      containers:
        # No command/args are set, so the image's default entrypoint runs.
        # NOTE(review): confirm that k8s-driver-manager actually consumes
        # NVIDIA_DRIVER_VERSION and performs the driver installation.
        - name: driver-installer
          image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0
          securityContext:
            privileged: true
          env:
            - name: NVIDIA_DRIVER_VERSION
              value: "535.129.03"  # or the version you require
            # Name of the node this pod is scheduled on (downward API).
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          volumeMounts:
            - name: root
              mountPath: /host
              mountPropagation: Bidirectional
            - name: modules
              mountPath: /lib/modules
              readOnly: true
            - name: nvidia-local
              mountPath: /host/usr/local/nvidia
      volumes:
        - name: root
          hostPath:
            path: /
        - name: modules
          hostPath:
            path: /lib/modules
        - name: nvidia-local
          hostPath:
            path: /usr/local/nvidia
            type: DirectoryOrCreate
Loading
Loading