@@ -37,51 +37,110 @@ import (
3737
// Repository-relative paths of the kustomization files this test resolves
// with utils.LocateRepoFile, followed by names used by the test workloads.
const (
	// Base GPU plugin deployment.
	kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml"
	// GPU plugin overlay used by the monitoring test.
	monitoringYaml = "deployments/gpu_plugin/overlays/monitoring_shared-dev_nfd/kustomization.yaml"
	// GPU plugin overlay used by the resource manager test. The path uses a
	// single separator before the file name (it previously had a stray "//").
	rmEnabledYaml = "deployments/gpu_plugin/overlays/fractional_resources/kustomization.yaml"
	// NFD node feature rules applied before the resource manager overlay.
	nfdRulesYaml = "deployments/nfd/overlays/node-feature-rules/kustomization.yaml"

	containerName       = "testcontainer"
	tfKustomizationYaml = "deployments/gpu_tensorflow_test/kustomization.yaml"
	tfPodName           = "training-pod"
)
4447
4548func init () {
46- ginkgo .Describe ("GPU plugin [Device:gpu]" , describe )
49+ // This needs to be Ordered because only one GPU plugin can function on the node at once.
50+ ginkgo .Describe ("GPU plugin [Device:gpu]" , describe , ginkgo .Ordered )
51+ }
52+
53+ func createPluginAndVerifyExistence (f * framework.Framework , ctx context.Context , kustomizationPath , baseResource string ) {
54+ ginkgo .By ("deploying GPU plugin" )
55+ e2ekubectl .RunKubectlOrDie (f .Namespace .Name , "apply" , "-k" , filepath .Dir (kustomizationPath ))
56+
57+ ginkgo .By ("waiting for GPU plugin's availability" )
58+ podList , err := e2epod .WaitForPodsWithLabelRunningReady (ctx , f .ClientSet , f .Namespace .Name ,
59+ labels.Set {"app" : "intel-gpu-plugin" }.AsSelector (), 1 /* one replica */ , 100 * time .Second )
60+ if err != nil {
61+ e2edebug .DumpAllNamespaceInfo (ctx , f .ClientSet , f .Namespace .Name )
62+ e2ekubectl .LogFailedContainers (ctx , f .ClientSet , f .Namespace .Name , framework .Logf )
63+ framework .Failf ("unable to wait for all pods to be running and ready: %v" , err )
64+ }
65+
66+ ginkgo .By ("checking GPU plugin's securityContext" )
67+ if err = utils .TestPodsFileSystemInfo (podList .Items ); err != nil {
68+ framework .Failf ("container filesystem info checks failed: %v" , err )
69+ }
70+
71+ ginkgo .By ("checking if the resource is allocatable" )
72+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , v1 .ResourceName (baseResource ), 30 * time .Second , utils .WaitOpGreater ); err != nil {
73+ framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
74+ }
4775}
4876
4977func describe () {
5078 f := framework .NewDefaultFramework ("gpuplugin" )
5179 f .NamespacePodSecurityEnforceLevel = admissionapi .LevelPrivileged
5280
53- kustomizationPath , errFailedToLocateRepoFile := utils .LocateRepoFile (kustomizationYaml )
81+ vanillaPath , errFailedToLocateRepoFile := utils .LocateRepoFile (kustomizationYaml )
5482 if errFailedToLocateRepoFile != nil {
5583 framework .Failf ("unable to locate %q: %v" , kustomizationYaml , errFailedToLocateRepoFile )
5684 }
5785
58- ginkgo .BeforeEach (func (ctx context.Context ) {
59- ginkgo .By ("deploying GPU plugin" )
60- e2ekubectl .RunKubectlOrDie (f .Namespace .Name , "apply" , "-k" , filepath .Dir (kustomizationPath ))
61-
62- ginkgo .By ("waiting for GPU plugin's availability" )
63- podList , err := e2epod .WaitForPodsWithLabelRunningReady (ctx , f .ClientSet , f .Namespace .Name ,
64- labels.Set {"app" : "intel-gpu-plugin" }.AsSelector (), 1 /* one replica */ , 100 * time .Second )
65- if err != nil {
66- e2edebug .DumpAllNamespaceInfo (ctx , f .ClientSet , f .Namespace .Name )
67- e2ekubectl .LogFailedContainers (ctx , f .ClientSet , f .Namespace .Name , framework .Logf )
68- framework .Failf ("unable to wait for all pods to be running and ready: %v" , err )
69- }
70-
71- ginkgo .By ("checking GPU plugin's securityContext" )
72- if err = utils .TestPodsFileSystemInfo (podList .Items ); err != nil {
73- framework .Failf ("container filesystem info checks failed: %v" , err )
74- }
75- })
86+ monitoringPath , errFailedToLocateRepoFile := utils .LocateRepoFile (monitoringYaml )
87+ if errFailedToLocateRepoFile != nil {
88+ framework .Failf ("unable to locate %q: %v" , monitoringYaml , errFailedToLocateRepoFile )
89+ }
7690
77- ginkgo .Context ("When GPU resources are available [Resource:i915]" , func () {
78- ginkgo .BeforeEach (func (ctx context.Context ) {
79- ginkgo .By ("checking if the resource is allocatable" )
80- if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/i915" , 30 * time .Second ); err != nil {
81- framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
91+ nfdRulesPath , errFailedToLocateRepoFile := utils .LocateRepoFile (nfdRulesYaml )
92+ if errFailedToLocateRepoFile != nil {
93+ framework .Failf ("unable to locate %q: %v" , nfdRulesYaml , errFailedToLocateRepoFile )
94+ }
95+
96+ resourceManagerPath , errFailedToLocateRepoFile := utils .LocateRepoFile (rmEnabledYaml )
97+ if errFailedToLocateRepoFile != nil {
98+ framework .Failf ("unable to locate %q: %v" , rmEnabledYaml , errFailedToLocateRepoFile )
99+ }
100+
101+ ginkgo .Context ("When GPU plugin is deployed [Resource:i915]" , func () {
102+ ginkgo .AfterEach (func (ctx context.Context ) {
103+ framework .Logf ("Removing gpu-plugin manually" )
104+
105+ e2ekubectl .RunKubectlOrDie (f .Namespace .Name , "delete" , "-k" , filepath .Dir (vanillaPath ))
106+
107+ // Wait for resources to go to zero
108+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/i915" , 30 * time .Second , utils .WaitOpZero ); err != nil {
109+ framework .Failf ("unable to wait for nodes to have no resources: %v" , err )
82110 }
83111 })
112+
84113 ginkgo .It ("checks availability of GPU resources [App:busybox]" , func (ctx context.Context ) {
114+ createPluginAndVerifyExistence (f , ctx , vanillaPath , "gpu.intel.com/i915" )
115+
116+ podListFunc := framework .ListObjects (f .ClientSet .CoreV1 ().Pods (f .Namespace .Name ).List , metav1.ListOptions {})
117+
118+ pods , err := podListFunc (ctx )
119+ if err != nil {
120+ framework .Failf ("Couldn't list pods: %+v" , err )
121+ }
122+
123+ if len (pods .Items ) != 1 {
124+ framework .Failf ("Invalid amount of Pods listed %d" , len (pods .Items ))
125+ }
126+
127+ pluginPod := pods .Items [0 ]
128+
129+ ginkgo .By ("checking if CDI path is included in volumes" )
130+ found := false
131+ for _ , v := range pluginPod .Spec .Volumes {
132+ if v .HostPath != nil && v .HostPath .Path == "/var/run/cdi" {
133+ framework .Logf ("CDI volume found" )
134+ found = true
135+
136+ break
137+ }
138+ }
139+
140+ if ! found {
141+ framework .Fail ("Couldn't find CDI volume in GPU plugin deployment" )
142+ }
143+
85144 ginkgo .By ("submitting a pod requesting GPU resources" )
86145 podSpec := & v1.Pod {
87146 ObjectMeta : metav1.ObjectMeta {Name : "gpuplugin-tester" },
@@ -122,7 +181,41 @@ func describe() {
122181 framework .Logf ("found card and renderD from the log" )
123182 })
124183
184+ ginkgo .Context ("When [Deployment:monitoring] deployment is applied [Resource:i915]" , func () {
185+ ginkgo .It ("check if monitoring resource is available" , func (ctx context.Context ) {
186+ createPluginAndVerifyExistence (f , ctx , monitoringPath , "gpu.intel.com/i915" )
187+
188+ ginkgo .By ("checking if the monitoring resource is allocatable" )
189+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/i915_monitoring" , 30 * time .Second , utils .WaitOpGreater ); err != nil {
190+ framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
191+ }
192+ })
193+ })
194+
195+ ginkgo .Context ("When [Deployment:resourceManager] deployment is applied [Resource:i915]" , func () {
196+ ginkgo .It ("check if i915 resources is available" , func (ctx context.Context ) {
197+ e2ekubectl .RunKubectlOrDie (f .Namespace .Name , "apply" , "-k" , filepath .Dir (nfdRulesPath ))
198+
199+ createPluginAndVerifyExistence (f , ctx , resourceManagerPath , "gpu.intel.com/i915" )
200+
201+ // To speed up extended resource detection, let's restart NFD worker
202+ e2ekubectl .RunKubectlOrDie ("node-feature-discovery" , "rollout" , "restart" , "daemonset" , "nfd-worker" )
203+
204+ ginkgo .By ("checking if the millicores resource is allocatable" )
205+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/millicores" , 30 * time .Second , utils .WaitOpGreater ); err != nil {
206+ framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
207+ }
208+
209+ ginkgo .By ("checking if the tiles resource is allocatable" )
210+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/tiles" , 30 * time .Second , utils .WaitOpGreater ); err != nil {
211+ framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
212+ }
213+ })
214+ })
215+
125216 ginkgo .It ("run a small workload on the GPU [App:tensorflow]" , func (ctx context.Context ) {
217+ createPluginAndVerifyExistence (f , ctx , vanillaPath , "gpu.intel.com/i915" )
218+
126219 kustomYaml , err := utils .LocateRepoFile (tfKustomizationYaml )
127220 if err != nil {
128221 framework .Failf ("unable to locate %q: %v" , tfKustomizationYaml , err )
@@ -146,13 +239,9 @@ func describe() {
146239 })
147240
148241 ginkgo .Context ("When GPU resources are available [Resource:xe]" , func () {
149- ginkgo .BeforeEach (func (ctx context.Context ) {
150- ginkgo .By ("checking if the resource is allocatable" )
151- if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/xe" , 30 * time .Second ); err != nil {
152- framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
153- }
154- })
155242 ginkgo .It ("checks availability of GPU resources [App:busybox]" , func (ctx context.Context ) {
243+ createPluginAndVerifyExistence (f , ctx , vanillaPath , "gpu.intel.com/xe" )
244+
156245 ginkgo .By ("submitting a pod requesting GPU resources" )
157246 podSpec := & v1.Pod {
158247 ObjectMeta : metav1.ObjectMeta {Name : "gpuplugin-tester" },
0 commit comments