@@ -37,51 +37,112 @@ import (
 
 const (
 	kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml"
+	monitoringYaml = "deployments/gpu_plugin/overlays/monitoring_shared-dev_nfd/kustomization.yaml"
+	rmEnabledYaml = "deployments/gpu_plugin/overlays/fractional_resources/kustomization.yaml"
+	nfdRulesYaml = "deployments/nfd/overlays/node-feature-rules/kustomization.yaml"
 	containerName = "testcontainer"
 	tfKustomizationYaml = "deployments/gpu_tensorflow_test/kustomization.yaml"
 	tfPodName = "training-pod"
 )
 
 func init() {
-	ginkgo.Describe("GPU plugin [Device:gpu]", describe)
+	// This needs to be Ordered because only one GPU plugin can function on the node at once.
+	ginkgo.Describe("GPU plugin [Device:gpu]", describe, ginkgo.Ordered)
+}
+
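+// createPluginAndVerifyExistence deploys the GPU plugin from the given kustomization
+// directory, waits for the plugin pod to become Running and Ready, checks its
+// securityContext, and waits until baseResource is allocatable on a node.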
+func createPluginAndVerifyExistence(f *framework.Framework, ctx context.Context, kustomizationPath, baseResource string) {
+	ginkgo.By("deploying GPU plugin")
+	e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath))
+
+	ginkgo.By("waiting for GPU plugin's availability")
+	podList, err := e2epod.WaitForPodsWithLabelRunningReady(ctx, f.ClientSet, f.Namespace.Name,
+		labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second)
+	if err != nil {
+		e2edebug.DumpAllNamespaceInfo(ctx, f.ClientSet, f.Namespace.Name)
+		e2ekubectl.LogFailedContainers(ctx, f.ClientSet, f.Namespace.Name, framework.Logf)
+		framework.Failf("unable to wait for all pods to be running and ready: %v", err)
+	}
+
+	ginkgo.By("checking GPU plugin's securityContext")
+	if err = utils.TestPodsFileSystemInfo(podList.Items); err != nil {
+		framework.Failf("container filesystem info checks failed: %v", err)
+	}
+
+	ginkgo.By("checking if the resource is allocatable")
+	if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, v1.ResourceName(baseResource), 30*time.Second, utils.WaitForPositiveResource); err != nil {
+		framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
+	}
 }
 
 func describe() {
 	f := framework.NewDefaultFramework("gpuplugin")
 	f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged
 
-	kustomizationPath, errFailedToLocateRepoFile := utils.LocateRepoFile(kustomizationYaml)
+	vanillaPath, errFailedToLocateRepoFile := utils.LocateRepoFile(kustomizationYaml)
 	if errFailedToLocateRepoFile != nil {
 		framework.Failf("unable to locate %q: %v", kustomizationYaml, errFailedToLocateRepoFile)
 	}
 
-	ginkgo.BeforeEach(func(ctx context.Context) {
-		ginkgo.By("deploying GPU plugin")
-		e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath))
-
-		ginkgo.By("waiting for GPU plugin's availability")
-		podList, err := e2epod.WaitForPodsWithLabelRunningReady(ctx, f.ClientSet, f.Namespace.Name,
-			labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second)
-		if err != nil {
-			e2edebug.DumpAllNamespaceInfo(ctx, f.ClientSet, f.Namespace.Name)
-			e2ekubectl.LogFailedContainers(ctx, f.ClientSet, f.Namespace.Name, framework.Logf)
-			framework.Failf("unable to wait for all pods to be running and ready: %v", err)
-		}
-
-		ginkgo.By("checking GPU plugin's securityContext")
-		if err = utils.TestPodsFileSystemInfo(podList.Items); err != nil {
-			framework.Failf("container filesystem info checks failed: %v", err)
-		}
-	})
+	monitoringPath, errFailedToLocateRepoFile := utils.LocateRepoFile(monitoringYaml)
+	if errFailedToLocateRepoFile != nil {
+		framework.Failf("unable to locate %q: %v", monitoringYaml, errFailedToLocateRepoFile)
+	}
+
+	nfdRulesPath, errFailedToLocateRepoFile := utils.LocateRepoFile(nfdRulesYaml)
+	if errFailedToLocateRepoFile != nil {
+		framework.Failf("unable to locate %q: %v", nfdRulesYaml, errFailedToLocateRepoFile)
+	}
+
+	resourceManagerPath, errFailedToLocateRepoFile := utils.LocateRepoFile(rmEnabledYaml)
+	if errFailedToLocateRepoFile != nil {
+		framework.Failf("unable to locate %q: %v", rmEnabledYaml, errFailedToLocateRepoFile)
+	}
+
+	ginkgo.Context("When GPU plugin is deployed [Resource:i915]", func() {
+		ginkgo.AfterEach(func(ctx context.Context) {
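+			// Each spec deploys its own plugin variant, so remove the plugin and wait
+			// for the i915 resource to disappear before the next spec runs.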
+			framework.Logf("Removing gpu-plugin manually")
 
-	ginkgo.Context("When GPU resources are available [Resource:i915]", func() {
-		ginkgo.BeforeEach(func(ctx context.Context) {
-			ginkgo.By("checking if the resource is allocatable")
-			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/i915", 30*time.Second); err != nil {
-				framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
+			e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "delete", "-k", filepath.Dir(vanillaPath))
+
+			framework.Logf("Waiting for i915 resources to go to zero")
+
+			// Wait for resources to go to zero
+			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/i915", 30*time.Second, utils.WaitForZeroResource); err != nil {
+				framework.Failf("unable to wait for nodes to have no resources: %v", err)
 			}
 		})
+
 		ginkgo.It("checks availability of GPU resources [App:busybox]", func(ctx context.Context) {
+			createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/i915")
+
+			podListFunc := framework.ListObjects(f.ClientSet.CoreV1().Pods(f.Namespace.Name).List, metav1.ListOptions{})
+
+			pods, err := podListFunc(ctx)
+			if err != nil {
+				framework.Failf("Couldn't list pods: %+v", err)
+			}
+
+			if len(pods.Items) != 1 {
+				framework.Failf("Invalid amount of Pods listed %d", len(pods.Items))
+			}
+
+			pluginPod := pods.Items[0]
+
+			ginkgo.By("checking if CDI path is included in volumes")
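+			// The plugin deployment is expected to mount the host CDI spec directory
+			// (/var/run/cdi) as a hostPath volume.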
+			found := false
+			for _, v := range pluginPod.Spec.Volumes {
+				if v.HostPath != nil && v.HostPath.Path == "/var/run/cdi" {
+					framework.Logf("CDI volume found")
+					found = true
+
+					break
+				}
+			}
+
+			if !found {
+				framework.Fail("Couldn't find CDI volume in GPU plugin deployment")
+			}
+
 			ginkgo.By("submitting a pod requesting GPU resources")
 			podSpec := &v1.Pod{
 				ObjectMeta: metav1.ObjectMeta{Name: "gpuplugin-tester"},
@@ -122,7 +183,41 @@ func describe() {
 			framework.Logf("found card and renderD from the log")
 		})
 
+		ginkgo.Context("When [Deployment:monitoring] deployment is applied [Resource:i915]", func() {
+			ginkgo.It("check if monitoring resource is available", func(ctx context.Context) {
+				createPluginAndVerifyExistence(f, ctx, monitoringPath, "gpu.intel.com/i915")
+
+				ginkgo.By("checking if the monitoring resource is allocatable")
+				if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/i915_monitoring", 30*time.Second, utils.WaitForPositiveResource); err != nil {
+					framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
+				}
+			})
+		})
+
+		ginkgo.Context("When [Deployment:resourceManager] deployment is applied [Resource:i915]", func() {
+			ginkgo.It("check if i915 resources are available", func(ctx context.Context) {
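+				// Fractional resources (millicores/tiles) are exposed via NFD rules,
+				// so apply the node-feature rules before deploying the plugin.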
+				e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(nfdRulesPath))
+
+				createPluginAndVerifyExistence(f, ctx, resourceManagerPath, "gpu.intel.com/i915")
+
+				// To speed up extended resource detection, let's restart NFD worker
+				e2ekubectl.RunKubectlOrDie("node-feature-discovery", "rollout", "restart", "daemonset", "nfd-worker")
+
+				ginkgo.By("checking if the millicores resource is allocatable")
+				if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/millicores", 30*time.Second, utils.WaitForPositiveResource); err != nil {
+					framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
+				}
+
+				ginkgo.By("checking if the tiles resource is allocatable")
+				if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/tiles", 30*time.Second, utils.WaitForPositiveResource); err != nil {
+					framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
+				}
+			})
+		})
+
 		ginkgo.It("run a small workload on the GPU [App:tensorflow]", func(ctx context.Context) {
+			createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/i915")
+
 			kustomYaml, err := utils.LocateRepoFile(tfKustomizationYaml)
 			if err != nil {
 				framework.Failf("unable to locate %q: %v", tfKustomizationYaml, err)
@@ -146,13 +241,9 @@ func describe() {
 	})
 
 	ginkgo.Context("When GPU resources are available [Resource:xe]", func() {
-		ginkgo.BeforeEach(func(ctx context.Context) {
-			ginkgo.By("checking if the resource is allocatable")
-			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/xe", 30*time.Second); err != nil {
-				framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
-			}
-		})
		ginkgo.It("checks availability of GPU resources [App:busybox]", func(ctx context.Context) {
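+			// Same checks as the i915 case, but against the xe resource.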
+			createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/xe")
+
 			ginkgo.By("submitting a pod requesting GPU resources")
 			podSpec := &v1.Pod{
 				ObjectMeta: metav1.ObjectMeta{Name: "gpuplugin-tester"},