
NVIDIA / gpu-operator / 20858835142

09 Jan 2026 04:42PM UTC coverage: 23.925% (+0.3%) from 23.613%
Trigger: push (via GitHub)

Commit by karthikvetrivel: Add driver configuration digest computation and driver type support

Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>

53 of 90 new or added lines in 3 files covered. (58.89%)

1 existing line in 1 file now uncovered.

2850 of 11912 relevant lines covered (23.93%)

0.27 hits per line

Source file: /controllers/object_controls.go (51.28% of lines covered)
/**
# Copyright (c) NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package controllers

import (
        "bufio"
        "context"
        "errors"
        "fmt"
        "os"
        "path"
        "regexp"
        "sort"
        "strconv"
        "strings"

        "path/filepath"

        apiconfigv1 "github.com/openshift/api/config/v1"
        apiimagev1 "github.com/openshift/api/image/v1"
        secv1 "github.com/openshift/api/security/v1"
        promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
        "golang.org/x/mod/semver"
        appsv1 "k8s.io/api/apps/v1"
        corev1 "k8s.io/api/core/v1"
        nodev1 "k8s.io/api/node/v1"
        nodev1beta1 "k8s.io/api/node/v1beta1"
        apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
        apierrors "k8s.io/apimachinery/pkg/api/errors"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/apimachinery/pkg/runtime/schema"
        "k8s.io/apimachinery/pkg/types"
        "k8s.io/apimachinery/pkg/util/intstr"
        "k8s.io/utils/ptr"
        "sigs.k8s.io/controller-runtime/pkg/client"
        "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
        "sigs.k8s.io/yaml"

        gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
        "github.com/NVIDIA/gpu-operator/internal/consts"
        "github.com/NVIDIA/gpu-operator/internal/utils"
)

const (
        // DefaultContainerdConfigFile indicates default config file path for containerd
        DefaultContainerdConfigFile = "/etc/containerd/config.toml"
        // DefaultContainerdDropInConfigFile indicates default drop-in config file path for containerd
        DefaultContainerdDropInConfigFile = "/etc/containerd/conf.d/99-nvidia.toml"
        // DefaultContainerdSocketFile indicates default containerd socket file
        DefaultContainerdSocketFile = "/run/containerd/containerd.sock"
        // DefaultDockerConfigFile indicates default config file path for docker
        DefaultDockerConfigFile = "/etc/docker/daemon.json"
        // DefaultDockerSocketFile indicates default docker socket file
        DefaultDockerSocketFile = "/var/run/docker.sock"
        // DefaultCRIOConfigFile indicates default config file path for cri-o
        DefaultCRIOConfigFile = "/etc/crio/config.toml"
        // DefaultCRIODropInConfigFile indicates the default path to the drop-in config file for cri-o
        DefaultCRIODropInConfigFile = "/etc/crio/crio.conf.d/99-nvidia.conf"
        // TrustedCAConfigMapName indicates configmap with custom user CA injected
        TrustedCAConfigMapName = "gpu-operator-trusted-ca"
        // TrustedCABundleFileName indicates custom user CA certificate filename
        TrustedCABundleFileName = "ca-bundle.crt"
        // TrustedCABundleMountDir indicates target mount directory of user CA bundle
        TrustedCABundleMountDir = "/etc/pki/ca-trust/extracted/pem"
        // TrustedCACertificate indicates injected CA certificate name
        TrustedCACertificate = "tls-ca-bundle.pem"
        // DefaultRuntimeClass represents "nvidia" RuntimeClass
        DefaultRuntimeClass = "nvidia"
        // DriverInstallPathVolName represents volume name for driver install path provided to toolkit
        DriverInstallPathVolName = "driver-install-path"
        // DefaultRuntimeSocketTargetDir represents target directory where the runtime socket directory will be mounted
        DefaultRuntimeSocketTargetDir = "/runtime/sock-dir/"
        // DefaultRuntimeConfigTargetDir represents target directory where the runtime config directory will be mounted
        DefaultRuntimeConfigTargetDir = "/runtime/config-dir/"
        // DefaultRuntimeDropInConfigTargetDir represents target directory where the drop-in config directory will be mounted
        DefaultRuntimeDropInConfigTargetDir = "/runtime/config-dir.d/"
        // ValidatorImageEnvName indicates env name for validator image passed
        ValidatorImageEnvName = "VALIDATOR_IMAGE"
        // ValidatorImagePullPolicyEnvName indicates env name for validator image pull policy passed
        ValidatorImagePullPolicyEnvName = "VALIDATOR_IMAGE_PULL_POLICY"
        // ValidatorImagePullSecretsEnvName indicates env name for validator image pull secrets passed
        ValidatorImagePullSecretsEnvName = "VALIDATOR_IMAGE_PULL_SECRETS"
        // ValidatorRuntimeClassEnvName indicates env name of runtime class to be applied to validator pods
        ValidatorRuntimeClassEnvName = "VALIDATOR_RUNTIME_CLASS"
        // MigStrategyEnvName indicates env name for passing MIG strategy
        MigStrategyEnvName = "MIG_STRATEGY"
        // MigPartedDefaultConfigMapName indicates name of ConfigMap containing default mig-parted config
        MigPartedDefaultConfigMapName = "default-mig-parted-config"
        // MigDefaultGPUClientsConfigMapName indicates name of ConfigMap containing default gpu-clients
        MigDefaultGPUClientsConfigMapName = "default-gpu-clients"
        // DCGMRemoteEngineEnvName indicates env name to specify remote DCGM host engine ip:port
        DCGMRemoteEngineEnvName = "DCGM_REMOTE_HOSTENGINE_INFO"
        // DCGMDefaultPort indicates default port bound to DCGM host engine
        DCGMDefaultPort = 5555
        // GPUDirectRDMAEnabledEnvName indicates if GPUDirect RDMA is enabled through the GPU operator
        GPUDirectRDMAEnabledEnvName = "GPU_DIRECT_RDMA_ENABLED"
        // UseHostMOFEDEnvName indicates if MOFED driver is pre-installed on the host
        UseHostMOFEDEnvName = "USE_HOST_MOFED"
        // MetricsConfigMountPath indicates mount path for custom dcgm metrics file
        MetricsConfigMountPath = "/etc/dcgm-exporter/" + MetricsConfigFileName
        // MetricsConfigFileName indicates custom dcgm metrics file name
        MetricsConfigFileName = "dcgm-metrics.csv"
        // NvidiaAnnotationHashKey indicates annotation name for last applied hash by gpu-operator
        NvidiaAnnotationHashKey = "nvidia.com/last-applied-hash"
        // NvidiaDisableRequireEnvName is the env name to disable default cuda constraints
        NvidiaDisableRequireEnvName = "NVIDIA_DISABLE_REQUIRE"
        // GDSEnabledEnvName is the env name to enable GDS support with device-plugin
        GDSEnabledEnvName = "GDS_ENABLED"
        // MOFEDEnabledEnvName is the env name to enable MOFED devices injection with device-plugin
        MOFEDEnabledEnvName = "MOFED_ENABLED"
        // GDRCopyEnabledEnvName is the envvar that enables injection of the GDRCopy device node with the device-plugin
        GDRCopyEnabledEnvName = "GDRCOPY_ENABLED"
        // ServiceMonitorCRDName is the name of the CRD defining the ServiceMonitor kind
        ServiceMonitorCRDName = "servicemonitors.monitoring.coreos.com"
        // DefaultToolkitInstallDir is the default toolkit installation directory on the host
        DefaultToolkitInstallDir = "/usr/local/nvidia"
        // ToolkitInstallDirEnvName is the name of the toolkit container env for configuring where NVIDIA Container Toolkit is installed
        ToolkitInstallDirEnvName = "ROOT"
        // VgpuDMDefaultConfigMapName indicates name of ConfigMap containing default vGPU devices configuration
        VgpuDMDefaultConfigMapName = "default-vgpu-devices-config"
        // VgpuDMDefaultConfigName indicates name of default configuration in the vGPU devices config file
        VgpuDMDefaultConfigName = "default"
        // NvidiaCtrRuntimeModeEnvName is the name of the toolkit container env for configuring the NVIDIA Container Runtime mode
        NvidiaCtrRuntimeModeEnvName = "NVIDIA_CONTAINER_RUNTIME_MODE"
        // NvidiaCtrRuntimeCDIPrefixesEnvName is the name of the toolkit container env for configuring the CDI annotation prefixes
        NvidiaCtrRuntimeCDIPrefixesEnvName = "NVIDIA_CONTAINER_RUNTIME_MODES_CDI_ANNOTATION_PREFIXES"
        // CDIEnabledEnvName is the name of the envvar used to enable CDI in the operands
        CDIEnabledEnvName = "CDI_ENABLED"
        // NvidiaCDIHookPathEnvName is the name of the envvar specifying the path to the 'nvidia-cdi-hook' binary
        NvidiaCDIHookPathEnvName = "NVIDIA_CDI_HOOK_PATH"
        // CRIOConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration
        CRIOConfigModeEnvName = "CRIO_CONFIG_MODE"
        // DeviceListStrategyEnvName is the name of the envvar for configuring the device-list-strategy in the device-plugin
        DeviceListStrategyEnvName = "DEVICE_LIST_STRATEGY"
        // CDIAnnotationPrefixEnvName is the name of the device-plugin envvar for configuring the CDI annotation prefix
        CDIAnnotationPrefixEnvName = "CDI_ANNOTATION_PREFIX"
        // KataManagerAnnotationHashKey is the annotation indicating the hash of the kata-manager configuration
        KataManagerAnnotationHashKey = "nvidia.com/kata-manager.last-applied-hash"
        // DefaultKataArtifactsDir is the default directory to store kata artifacts on the host
        DefaultKataArtifactsDir = "/opt/nvidia-gpu-operator/artifacts/runtimeclasses/"
        // PodControllerRevisionHashLabelKey is the label key for the pod controller revision hash value
        PodControllerRevisionHashLabelKey = "controller-revision-hash"
        // DefaultCCModeEnvName is the name of the envvar for configuring default CC mode on all compatible GPUs on the node
        DefaultCCModeEnvName = "DEFAULT_CC_MODE"
        // OpenKernelModulesEnabledEnvName is the name of the driver-container envvar for enabling open GPU kernel module support
        OpenKernelModulesEnabledEnvName = "OPEN_KERNEL_MODULES_ENABLED"
        // KernelModuleTypeEnvName is the name of the driver-container envvar to set the desired kernel module type
        KernelModuleTypeEnvName = "KERNEL_MODULE_TYPE"
        // DriverTypeEnvName is the name of the driver-container envvar to set the driver type
        DriverTypeEnvName = "DRIVER_TYPE"
        // MPSRootEnvName is the name of the envvar for configuring the MPS root
        MPSRootEnvName = "MPS_ROOT"
        // DefaultMPSRoot is the default MPS root path on the host
        DefaultMPSRoot = "/run/nvidia/mps"
        // HostRootEnvName is the name of the envvar representing the root path of the underlying host
        HostRootEnvName = "HOST_ROOT"
        // DefaultDriverInstallDir represents the default path of a driver container installation
        DefaultDriverInstallDir = "/run/nvidia/driver"
        // DriverInstallDirEnvName is the name of the envvar used by the driver-validator to represent the driver install dir
        DriverInstallDirEnvName = "DRIVER_INSTALL_DIR"
        // DriverInstallDirCtrPathEnvName is the name of the envvar used by the driver-validator to represent the path
        // of the driver install dir mounted in the container
        DriverInstallDirCtrPathEnvName = "DRIVER_INSTALL_DIR_CTR_PATH"
        // NvidiaRuntimeSetAsDefaultEnvName is the name of the toolkit container env for configuring NVIDIA Container Runtime as the default runtime
        NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT"
)

// ContainerProbe defines container probe types
type ContainerProbe string

const (
        // Startup probe
        Startup ContainerProbe = "startup"
        // Liveness probe
        Liveness ContainerProbe = "liveness"
        // Readiness probe
        Readiness ContainerProbe = "readiness"
)

// rootUID represents user 0
var rootUID = ptr.To(int64(0))

// RepoConfigPathMap indicates standard OS specific paths for repository configuration files
var RepoConfigPathMap = map[string]string{
        "centos": "/etc/yum.repos.d",
        "ubuntu": "/etc/apt/sources.list.d",
        "rhcos":  "/etc/yum.repos.d",
        "rhel":   "/etc/yum.repos.d",
}

// CertConfigPathMap indicates standard OS specific paths for ssl keys/certificates.
// Where Go looks for certs: https://golang.org/src/crypto/x509/root_linux.go
// Where OCP mounts proxy certs on RHCOS nodes:
// https://access.redhat.com/documentation/en-us/openshift_container_platform/4.3/html/authentication/ocp-certificates#proxy-certificates_ocp-certificates
var CertConfigPathMap = map[string]string{
        "centos": "/etc/pki/ca-trust/extracted/pem",
        "ubuntu": "/usr/local/share/ca-certificates",
        "rhcos":  "/etc/pki/ca-trust/extracted/pem",
        "rhel":   "/etc/pki/ca-trust/extracted/pem",
}

// MountPathToVolumeSource maps a container mount path to a VolumeSource
type MountPathToVolumeSource map[string]corev1.VolumeSource

// SubscriptionPathMap contains information on OS-specific paths
// that provide entitlements/subscription details on the host.
// These are used to enable Driver Container's access to packages controlled by
// the distro through their subscription and support program.
var SubscriptionPathMap = map[string](MountPathToVolumeSource){
        "rhel": {
                "/run/secrets/etc-pki-entitlement": corev1.VolumeSource{
                        HostPath: &corev1.HostPathVolumeSource{
                                Path: "/etc/pki/entitlement",
                                Type: ptr.To(corev1.HostPathDirectory),
                        },
                },
                "/run/secrets/redhat.repo": corev1.VolumeSource{
                        HostPath: &corev1.HostPathVolumeSource{
                                Path: "/etc/yum.repos.d/redhat.repo",
                                Type: ptr.To(corev1.HostPathFile),
                        },
                },
                "/run/secrets/rhsm": corev1.VolumeSource{
                        HostPath: &corev1.HostPathVolumeSource{
                                Path: "/etc/rhsm",
                                Type: ptr.To(corev1.HostPathDirectory),
                        },
                },
        },
        "rhcos": {
                "/run/secrets/etc-pki-entitlement": corev1.VolumeSource{
                        HostPath: &corev1.HostPathVolumeSource{
                                Path: "/etc/pki/entitlement",
                                Type: ptr.To(corev1.HostPathDirectory),
                        },
                },
                "/run/secrets/redhat.repo": corev1.VolumeSource{
                        HostPath: &corev1.HostPathVolumeSource{
                                Path: "/etc/yum.repos.d/redhat.repo",
                                Type: ptr.To(corev1.HostPathFile),
                        },
                },
                "/run/secrets/rhsm": corev1.VolumeSource{
                        HostPath: &corev1.HostPathVolumeSource{
                                Path: "/etc/rhsm",
                                Type: ptr.To(corev1.HostPathDirectory),
                        },
                },
        },
        "sles": {
                "/etc/zypp/credentials.d": corev1.VolumeSource{
                        HostPath: &corev1.HostPathVolumeSource{
                                Path: "/etc/zypp/credentials.d",
                                Type: ptr.To(corev1.HostPathDirectory),
                        },
                },
                "/etc/SUSEConnect": corev1.VolumeSource{
                        HostPath: &corev1.HostPathVolumeSource{
                                Path: "/etc/SUSEConnect",
                                Type: ptr.To(corev1.HostPathFileOrCreate),
                        },
                },
        },
        "sl-micro": {
                "/etc/zypp/credentials.d": corev1.VolumeSource{
                        HostPath: &corev1.HostPathVolumeSource{
                                Path: "/etc/zypp/credentials.d",
                                Type: ptr.To(corev1.HostPathDirectory),
                        },
                },
                "/etc/SUSEConnect": corev1.VolumeSource{
                        HostPath: &corev1.HostPathVolumeSource{
                                Path: "/etc/SUSEConnect",
                                Type: ptr.To(corev1.HostPathFileOrCreate),
                        },
                },
        },
}

type controlFunc []func(n ClusterPolicyController) (gpuv1.State, error)

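// Editor's note (illustrative sketch, not part of the upstream file): controlFunc
// is the building block for per-state reconciliation. Assuming the controller
// wires the states up elsewhere, usage would look roughly like:
//
//      fns := controlFunc{ServiceAccount, Role, RoleBinding, ClusterRole, ClusterRoleBinding, ConfigMaps}
//      for _, fn := range fns {
//              status, err := fn(n)
//              if err != nil || status == gpuv1.NotReady {
//                      // requeue: this state is not ready yet
//              }
//      }
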
// ServiceAccount creates ServiceAccount resource
func ServiceAccount(n ClusterPolicyController) (gpuv1.State, error) {
        ctx := n.ctx
        state := n.idx
        obj := n.resources[state].ServiceAccount.DeepCopy()
        obj.Namespace = n.operatorNamespace

        logger := n.logger.WithValues("ServiceAccount", obj.Name, "Namespace", obj.Namespace)

        // Check if state is disabled and cleanup resource if exists
        if !n.isStateEnabled(n.stateNames[n.idx]) {
                err := n.client.Delete(ctx, obj)
                if err != nil && !apierrors.IsNotFound(err) {
                        logger.Info("Couldn't delete", "Error", err)
                        return gpuv1.NotReady, err
                }
                return gpuv1.Disabled, nil
        }

        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
                return gpuv1.NotReady, err
        }

        if err := n.client.Create(ctx, obj); err != nil {
                if apierrors.IsAlreadyExists(err) {
                        logger.Info("Found Resource, skipping update")
                        return gpuv1.Ready, nil
                }

                logger.Info("Couldn't create", "Error", err)
                return gpuv1.NotReady, err
        }
        return gpuv1.Ready, nil
}

// Role creates Role resource
func Role(n ClusterPolicyController) (gpuv1.State, error) {
        ctx := n.ctx
        state := n.idx
        obj := n.resources[state].Role.DeepCopy()
        obj.Namespace = n.operatorNamespace

        logger := n.logger.WithValues("Role", obj.Name, "Namespace", obj.Namespace)

        // Check if state is disabled and cleanup resource if exists
        if !n.isStateEnabled(n.stateNames[n.idx]) {
                err := n.client.Delete(ctx, obj)
                if err != nil && !apierrors.IsNotFound(err) {
                        logger.Info("Couldn't delete", "Error", err)
                        return gpuv1.NotReady, err
                }
                return gpuv1.Disabled, nil
        }

        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
                return gpuv1.NotReady, err
        }

        if err := n.client.Create(ctx, obj); err != nil {
                if apierrors.IsAlreadyExists(err) {
                        logger.Info("Found Resource, updating...")
                        err = n.client.Update(ctx, obj)
                        if err != nil {
                                logger.Info("Couldn't update", "Error", err)
                                return gpuv1.NotReady, err
                        }
                        return gpuv1.Ready, nil
                }

                logger.Info("Couldn't create", "Error", err)
                return gpuv1.NotReady, err
        }

        return gpuv1.Ready, nil
}

// RoleBinding creates RoleBinding resource
func RoleBinding(n ClusterPolicyController) (gpuv1.State, error) {
        ctx := n.ctx
        state := n.idx
        obj := n.resources[state].RoleBinding.DeepCopy()
        obj.Namespace = n.operatorNamespace

        logger := n.logger.WithValues("RoleBinding", obj.Name, "Namespace", obj.Namespace)

        // Check if state is disabled and cleanup resource if exists
        if !n.isStateEnabled(n.stateNames[n.idx]) {
                err := n.client.Delete(ctx, obj)
                if err != nil && !apierrors.IsNotFound(err) {
                        logger.Info("Couldn't delete", "Error", err)
                        return gpuv1.NotReady, err
                }
                return gpuv1.Disabled, nil
        }

        for idx := range obj.Subjects {
                // we don't want to update ALL the Subjects[].Namespace; e.g. we need to keep 'openshift-monitoring'
                // to allow PrometheusOperator to scrape our metrics resources:
                // see in assets/state-dcgm-exporter, 0500_prom_rolebinding_openshift.yaml vs 0300_rolebinding.yaml
                if obj.Subjects[idx].Namespace != "FILLED BY THE OPERATOR" {
                        continue
                }
                obj.Subjects[idx].Namespace = n.operatorNamespace
        }

        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
                return gpuv1.NotReady, err
        }

        if err := n.client.Create(ctx, obj); err != nil {
                if apierrors.IsAlreadyExists(err) {
                        logger.Info("Found Resource, updating...")
                        err = n.client.Update(ctx, obj)
                        if err != nil {
                                logger.Info("Couldn't update", "Error", err)
                                return gpuv1.NotReady, err
                        }
                        return gpuv1.Ready, nil
                }

                logger.Info("Couldn't create", "Error", err)
                return gpuv1.NotReady, err
        }

        return gpuv1.Ready, nil
}

// ClusterRole creates ClusterRole resource
func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) {
        ctx := n.ctx
        state := n.idx
        obj := n.resources[state].ClusterRole.DeepCopy()
        obj.Namespace = n.operatorNamespace

        logger := n.logger.WithValues("ClusterRole", obj.Name, "Namespace", obj.Namespace)

        // Check if state is disabled and cleanup resource if exists
        if !n.isStateEnabled(n.stateNames[n.idx]) {
                err := n.client.Delete(ctx, obj)
                if err != nil && !apierrors.IsNotFound(err) {
                        logger.Info("Couldn't delete", "Error", err)
                        return gpuv1.NotReady, err
                }
                return gpuv1.Disabled, nil
        }

        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
                return gpuv1.NotReady, err
        }

        if err := n.client.Create(ctx, obj); err != nil {
                if apierrors.IsAlreadyExists(err) {
                        logger.Info("Found Resource, updating...")
                        err = n.client.Update(ctx, obj)
                        if err != nil {
                                logger.Info("Couldn't update", "Error", err)
                                return gpuv1.NotReady, err
                        }
                        return gpuv1.Ready, nil
                }

                logger.Info("Couldn't create", "Error", err)
                return gpuv1.NotReady, err
        }

        return gpuv1.Ready, nil
}

// ClusterRoleBinding creates ClusterRoleBinding resource
func ClusterRoleBinding(n ClusterPolicyController) (gpuv1.State, error) {
        ctx := n.ctx
        state := n.idx
        obj := n.resources[state].ClusterRoleBinding.DeepCopy()
        obj.Namespace = n.operatorNamespace

        logger := n.logger.WithValues("ClusterRoleBinding", obj.Name, "Namespace", obj.Namespace)

        // Check if state is disabled and cleanup resource if exists
        if !n.isStateEnabled(n.stateNames[n.idx]) {
                err := n.client.Delete(ctx, obj)
                if err != nil && !apierrors.IsNotFound(err) {
                        logger.Info("Couldn't delete", "Error", err)
                        return gpuv1.NotReady, err
                }
                return gpuv1.Disabled, nil
        }

        for idx := range obj.Subjects {
                obj.Subjects[idx].Namespace = n.operatorNamespace
        }

        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
                return gpuv1.NotReady, err
        }

        if err := n.client.Create(ctx, obj); err != nil {
                if apierrors.IsAlreadyExists(err) {
                        logger.Info("Found Resource, updating...")
                        err = n.client.Update(ctx, obj)
                        if err != nil {
                                logger.Info("Couldn't update", "Error", err)
                                return gpuv1.NotReady, err
                        }
                        return gpuv1.Ready, nil
                }

                logger.Info("Couldn't create", "Error", err)
                return gpuv1.NotReady, err
        }

        return gpuv1.Ready, nil
}

// createConfigMap creates a ConfigMap resource
func createConfigMap(n ClusterPolicyController, configMapIdx int) (gpuv1.State, error) {
        ctx := n.ctx
        state := n.idx
        config := n.singleton.Spec
        obj := n.resources[state].ConfigMaps[configMapIdx].DeepCopy()
        obj.Namespace = n.operatorNamespace

        logger := n.logger.WithValues("ConfigMap", obj.Name, "Namespace", obj.Namespace)

        // Check if state is disabled and cleanup resource if exists
        if !n.isStateEnabled(n.stateNames[n.idx]) {
                err := n.client.Delete(ctx, obj)
                if err != nil && !apierrors.IsNotFound(err) {
                        logger.Info("Couldn't delete", "Error", err)
                        return gpuv1.NotReady, err
                }
                return gpuv1.Disabled, nil
        }

        // avoid creating default 'mig-parted-config' ConfigMap if custom one is provided
        if obj.Name == MigPartedDefaultConfigMapName {
                if name, isCustom := gpuv1.GetConfigMapName(config.MIGManager.Config, MigPartedDefaultConfigMapName); isCustom {
                        logger.Info("Not creating resource, custom ConfigMap provided", "Name", name)
                        return gpuv1.Ready, nil
                }
        }

        // avoid creating default 'gpu-clients' ConfigMap if custom one is provided
        if obj.Name == MigDefaultGPUClientsConfigMapName {
                if name, isCustom := gpuv1.GetConfigMapName(config.MIGManager.GPUClientsConfig, MigDefaultGPUClientsConfigMapName); isCustom {
                        logger.Info("Not creating resource, custom ConfigMap provided", "Name", name)
                        return gpuv1.Ready, nil
                }
        }

        // avoid creating default vGPU device manager ConfigMap if custom one is provided
        if obj.Name == VgpuDMDefaultConfigMapName {
                if name, isCustom := gpuv1.GetConfigMapName(config.VGPUDeviceManager.Config, VgpuDMDefaultConfigMapName); isCustom {
                        logger.Info("Not creating resource, custom ConfigMap provided", "Name", name)
                        return gpuv1.Ready, nil
                }
        }

        if obj.Name == "nvidia-kata-manager-config" {
                data, err := yaml.Marshal(config.KataManager.Config)
                if err != nil {
                        return gpuv1.NotReady, fmt.Errorf("failed to marshal kata manager config: %v", err)
                }
                obj.Data = map[string]string{
                        "config.yaml": string(data),
                }
        }

        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
                return gpuv1.NotReady, err
        }

        if err := n.client.Create(ctx, obj); err != nil {
                if !apierrors.IsAlreadyExists(err) {
                        logger.Info("Couldn't create", "Error", err)
                        return gpuv1.NotReady, err
                }

                logger.Info("Found Resource, updating...")
                err = n.client.Update(ctx, obj)
                if err != nil {
                        logger.Info("Couldn't update", "Error", err)
                        return gpuv1.NotReady, err
                }
        }

        return gpuv1.Ready, nil
}

// ConfigMaps creates ConfigMap resource(s)
func ConfigMaps(n ClusterPolicyController) (gpuv1.State, error) {
        status := gpuv1.Ready
        state := n.idx
        for i := range n.resources[state].ConfigMaps {
                stat, err := createConfigMap(n, i)
                if err != nil {
                        return stat, err
                }
                if stat != gpuv1.Ready {
                        status = gpuv1.NotReady
                }
        }
        return status, nil
}

// getKernelVersionsMap returns a map of kernel versions to their corresponding OS from all GPU nodes in the cluster
func (n ClusterPolicyController) getKernelVersionsMap() (map[string]string, error) {
        kernelVersionMap := make(map[string]string)
        ctx := n.ctx
        logger := n.logger.WithValues("Request.Namespace", "default", "Request.Name", "Node")

        // Filter only GPU nodes
        opts := []client.ListOption{
                client.MatchingLabels{"nvidia.com/gpu.present": "true"},
        }

        list := &corev1.NodeList{}
        err := n.client.List(ctx, list, opts...)
        if err != nil {
                logger.Info("Could not get NodeList", "ERROR", err)
                return nil, err
        }

        if len(list.Items) == 0 {
                // none of the nodes matched the nvidia GPU label;
                // either the nodes do not have GPUs, or NFD is not running
                logger.Info("Could not get any nodes to match nvidia.com/gpu.present label")
                return nil, nil
        }

        for _, node := range list.Items {
                labels := node.GetLabels()
                if kernelVersion, ok := labels[nfdKernelLabelKey]; ok {
                        logger.Info("Found kernel version label", "version", kernelVersion)
                        // get OS version for this kernel
                        osType := labels[nfdOSReleaseIDLabelKey]
                        osVersion := labels[nfdOSVersionIDLabelKey]
                        nodeOS := fmt.Sprintf("%s%s", osType, osVersion)
                        if os, ok := kernelVersionMap[kernelVersion]; ok {
                                if os != nodeOS {
                                        return nil, fmt.Errorf("different OS versions found for the same kernel version %s, unsupported configuration", kernelVersion)
                                }
                        }
                        // add mapping for "kernelVersion" --> "OS"
                        kernelVersionMap[kernelVersion] = nodeOS
                } else {
                        err := apierrors.NewNotFound(schema.GroupResource{Group: "Node", Resource: "Label"}, nfdKernelLabelKey)
                        logger.Error(err, "Failed to get kernel version of GPU node using Node Feature Discovery (NFD) labels. Is NFD installed in the cluster?")
                        return nil, err
                }
        }

        return kernelVersionMap, nil
}

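// Editor's note (illustrative sketch with assumed node labels): if two GPU nodes
// carry kernel label "5.15.0-91-generic" with osType "ubuntu" and osVersion
// "22.04", getKernelVersionsMap returns
// map[string]string{"5.15.0-91-generic": "ubuntu22.04"}. Nodes sharing a kernel
// version but reporting different OS versions cause an error, since that is an
// unsupported configuration.
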
func kernelFullVersion(n ClusterPolicyController) (string, string, string) {
        ctx := n.ctx
        logger := n.logger.WithValues("Request.Namespace", "default", "Request.Name", "Node")
        // We need the node labels to fetch the correct container
        opts := []client.ListOption{
                client.MatchingLabels{"nvidia.com/gpu.present": "true"},
        }

        list := &corev1.NodeList{}
        err := n.client.List(ctx, list, opts...)
        if err != nil {
                logger.Info("Could not get NodeList", "ERROR", err)
                return "", "", ""
        }

        if len(list.Items) == 0 {
                // none of the nodes matched the nvidia GPU label;
                // either the nodes do not have GPUs, or NFD is not running
                logger.Info("Could not get any nodes to match nvidia.com/gpu.present label", "ERROR", "")
                return "", "", ""
        }

        // Assuming all nodes are running the same kernel version;
        // one could easily add driver-kernel-versions for each node.
        node := list.Items[0]
        labels := node.GetLabels()

        var ok bool
        kFVersion, ok := labels[nfdKernelLabelKey]
        if ok {
                logger.Info(kFVersion)
        } else {
                err := apierrors.NewNotFound(schema.GroupResource{Group: "Node", Resource: "Label"}, nfdKernelLabelKey)
                logger.Info("Couldn't get kernelVersion, did you run the node feature discovery?", "Error", err)
                return "", "", ""
        }

        osName, ok := labels[nfdOSReleaseIDLabelKey]
        if !ok {
                return kFVersion, "", ""
        }
        osVersion, ok := labels[nfdOSVersionIDLabelKey]
        if !ok {
                return kFVersion, "", ""
        }
        osTag := fmt.Sprintf("%s%s", osName, osVersion)

        return kFVersion, osTag, osVersion
}

func preprocessService(obj *corev1.Service, n ClusterPolicyController) error {
        logger := n.logger.WithValues("Service", obj.Name)
        transformations := map[string]func(*corev1.Service, *gpuv1.ClusterPolicySpec) error{
                "nvidia-dcgm-exporter": TransformDCGMExporterService,
        }

        t, ok := transformations[obj.Name]
        if !ok {
                logger.V(2).Info(fmt.Sprintf("No transformation for Service '%s'", obj.Name))
                return nil
        }

        err := t(obj, &n.singleton.Spec)
        if err != nil {
                logger.Error(err, "Failed to apply transformation", "Service", obj.Name)
                return err
        }

        return nil
}

func preProcessDaemonSet(obj *appsv1.DaemonSet, n ClusterPolicyController) error {
        logger := n.logger.WithValues("Daemonset", obj.Name)
        transformations := map[string]func(*appsv1.DaemonSet, *gpuv1.ClusterPolicySpec, ClusterPolicyController) error{
                "nvidia-driver-daemonset":                 TransformDriver,
                "nvidia-vgpu-manager-daemonset":           TransformVGPUManager,
                "nvidia-vgpu-device-manager":              TransformVGPUDeviceManager,
                "nvidia-vfio-manager":                     TransformVFIOManager,
                "nvidia-container-toolkit-daemonset":      TransformToolkit,
                "nvidia-device-plugin-daemonset":          TransformDevicePlugin,
                "nvidia-device-plugin-mps-control-daemon": TransformMPSControlDaemon,
                "nvidia-sandbox-device-plugin-daemonset":  TransformSandboxDevicePlugin,
                "nvidia-dcgm":                             TransformDCGM,
                "nvidia-dcgm-exporter":                    TransformDCGMExporter,
                "nvidia-node-status-exporter":             TransformNodeStatusExporter,
                "gpu-feature-discovery":                   TransformGPUDiscoveryPlugin,
                "nvidia-mig-manager":                      TransformMIGManager,
                "nvidia-operator-validator":               TransformValidator,
                "nvidia-sandbox-validator":                TransformSandboxValidator,
                "nvidia-kata-manager":                     TransformKataManager,
                "nvidia-cc-manager":                       TransformCCManager,
        }

        t, ok := transformations[obj.Name]
        if !ok {
                logger.Info(fmt.Sprintf("No transformation for Daemonset '%s'", obj.Name))
                return nil
        }

        // apply common Daemonset configuration that is applicable to all
        err := applyCommonDaemonsetConfig(obj, &n.singleton.Spec)
        if err != nil {
                logger.Error(err, "Failed to apply common Daemonset transformation", "resource", obj.Name)
                return err
        }

        // transform the host-root and host-dev-char volumes if a custom host root is configured with the operator
        transformForHostRoot(obj, n.singleton.Spec.HostPaths.RootFS)

        // transform the driver-root volume if a custom driver install dir is configured with the operator
        transformForDriverInstallDir(obj, n.singleton.Spec.HostPaths.DriverInstallDir)

        // apply per operand Daemonset config
        err = t(obj, &n.singleton.Spec, n)
        if err != nil {
                logger.Error(err, "Failed to apply transformation", "resource", obj.Name)
                return err
        }

        // apply custom labels and annotations to the podSpec, if any
        applyCommonDaemonsetMetadata(obj, &n.singleton.Spec.Daemonsets)

        return nil
}

// applyCommonDaemonsetMetadata adds additional labels and annotations to the daemonset podSpec,
// if any are specified by the user
func applyCommonDaemonsetMetadata(obj *appsv1.DaemonSet, dsSpec *gpuv1.DaemonsetsSpec) {
        if len(dsSpec.Labels) > 0 {
                if obj.Spec.Template.Labels == nil {
                        obj.Spec.Template.Labels = make(map[string]string)
                }
                for labelKey, labelValue := range dsSpec.Labels {
                        // If the user specifies an override of the "app" or the "app.kubernetes.io/part-of" key, we skip it.
                        // DaemonSet pod selectors are immutable, so we still want the pods to be selectable as before and working
                        // with the existing daemon set selectors.
                        if labelKey == "app" || labelKey == "app.kubernetes.io/part-of" {
                                continue
                        }
                        obj.Spec.Template.Labels[labelKey] = labelValue
                }
        }

        if len(dsSpec.Annotations) > 0 {
                if obj.Spec.Template.Annotations == nil {
                        obj.Spec.Template.Annotations = make(map[string]string)
                }
                for annoKey, annoVal := range dsSpec.Annotations {
                        obj.Spec.Template.Annotations[annoKey] = annoVal
                }
        }
}

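// Editor's note (illustrative sketch with assumed spec values): given
// dsSpec.Labels = map[string]string{"app": "custom", "team": "ml"}, only "team"
// is copied onto the pod template; "app" and "app.kubernetes.io/part-of" are
// skipped so the immutable DaemonSet pod selectors keep matching. Annotations
// have no such restriction and are copied verbatim.
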
// Apply common config that is applicable for all Daemonsets
func applyCommonDaemonsetConfig(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
        // apply daemonset update strategy
        err := applyUpdateStrategyConfig(obj, config)
        if err != nil {
                return err
        }

        // update PriorityClass
        if config.Daemonsets.PriorityClassName != "" {
                obj.Spec.Template.Spec.PriorityClassName = config.Daemonsets.PriorityClassName
        }

        // set tolerations if specified
        if len(config.Daemonsets.Tolerations) > 0 {
                obj.Spec.Template.Spec.Tolerations = config.Daemonsets.Tolerations
        }
        return nil
}

// apply necessary transforms if a custom host root path is configured
func transformForHostRoot(obj *appsv1.DaemonSet, hostRoot string) {
        if hostRoot == "" || hostRoot == "/" {
                return
        }

        transformHostRootVolume(obj, hostRoot)
        transformHostDevCharVolume(obj, hostRoot)
}

// transformHostRootVolume repoints the "host-root" hostPath volume at the custom
// host root and propagates the path to all containers via the HOST_ROOT env var.
func transformHostRootVolume(obj *appsv1.DaemonSet, hostRoot string) {
        containsHostRootVolume := false
        for _, volume := range obj.Spec.Template.Spec.Volumes {
                if volume.Name == "host-root" {
                        volume.HostPath.Path = hostRoot
                        containsHostRootVolume = true
                        break
                }
        }

        if !containsHostRootVolume {
                return
        }

        for index := range obj.Spec.Template.Spec.InitContainers {
                setContainerEnv(&(obj.Spec.Template.Spec.InitContainers[index]), HostRootEnvName, hostRoot)
        }

        for index := range obj.Spec.Template.Spec.Containers {
                setContainerEnv(&(obj.Spec.Template.Spec.Containers[index]), HostRootEnvName, hostRoot)
        }
}

// transformHostDevCharVolume repoints the "host-dev-char" hostPath volume at
// <hostRoot>/dev/char.
func transformHostDevCharVolume(obj *appsv1.DaemonSet, hostRoot string) {
        for _, volume := range obj.Spec.Template.Spec.Volumes {
                if volume.Name == "host-dev-char" {
                        volume.HostPath.Path = filepath.Join(hostRoot, "/dev/char")
                        break
                }
        }
}

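// Editor's note (illustrative sketch with an assumed hostRoot of "/custom/root"):
// the "host-root" hostPath volume is repointed at /custom/root, "host-dev-char"
// becomes /custom/root/dev/char, and every container and initContainer receives
// HOST_ROOT=/custom/root so operands resolve host paths against the custom root.
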
// apply necessary transforms if a custom driver install directory is configured
func transformForDriverInstallDir(obj *appsv1.DaemonSet, driverInstallDir string) {
        if driverInstallDir == "" || driverInstallDir == DefaultDriverInstallDir {
                return
        }

        containsDriverInstallDirVolume := false
        podSpec := obj.Spec.Template.Spec
        for _, volume := range podSpec.Volumes {
                if volume.Name == "driver-install-dir" {
                        volume.HostPath.Path = driverInstallDir
                        containsDriverInstallDirVolume = true
                        break
                }
        }

        if !containsDriverInstallDirVolume {
                return
        }

        if ctr := findContainerByName(podSpec.InitContainers, "driver-validation"); ctr != nil {
                setContainerEnv(ctr, DriverInstallDirEnvName, driverInstallDir)
                setContainerEnv(ctr, DriverInstallDirCtrPathEnvName, driverInstallDir)
                for i, volumeMount := range ctr.VolumeMounts {
                        if volumeMount.Name == "driver-install-dir" {
                                ctr.VolumeMounts[i].MountPath = driverInstallDir
                        }
                }
        }
}

// TransformGPUDiscoveryPlugin transforms GPU discovery daemonset with required config as per ClusterPolicy
func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        // update validation container
        err := transformValidationInitContainer(obj, config)
        if err != nil {
                return err
        }

        // update image
        img, err := gpuv1.ImagePath(&config.GPUFeatureDiscovery)
        if err != nil {
                return err
        }
        obj.Spec.Template.Spec.Containers[0].Image = img

        // update image pull policy
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.GPUFeatureDiscovery.ImagePullPolicy)

        // set image pull secrets
        if len(config.GPUFeatureDiscovery.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.GPUFeatureDiscovery.ImagePullSecrets)
        }

        // set resource limits
        if config.GPUFeatureDiscovery.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.GPUFeatureDiscovery.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.GPUFeatureDiscovery.Resources.Limits
                }
        }

        // set arguments if specified for the gpu-feature-discovery container
        if len(config.GPUFeatureDiscovery.Args) > 0 {
                obj.Spec.Template.Spec.Containers[0].Args = config.GPUFeatureDiscovery.Args
        }

        // set/append environment variables for the gpu-feature-discovery container
        if len(config.GPUFeatureDiscovery.Env) > 0 {
                for _, env := range config.GPUFeatureDiscovery.Env {
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
                }
        }

        // apply plugin configuration through ConfigMap if one is provided
        err = handleDevicePluginConfig(obj, config)
        if err != nil {
                return err
        }

        setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

        // update env required for MIG support
        applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)

        return nil
}

// parseOSRelease can be overridden in tests for mocking filesystem access.
// In production, it reads and parses /host-etc/os-release.
var parseOSRelease = parseOSReleaseFromFile

// osReleaseFilePath is the path to the os-release file, configurable for testing.
var osReleaseFilePath = "/host-etc/os-release"

// parseOSReleaseFromFile reads and parses the os-release file from the host filesystem.
func parseOSReleaseFromFile() (map[string]string, error) {
        release := map[string]string{}

        f, err := os.Open(osReleaseFilePath)
        if err != nil {
                return nil, err
        }
        defer f.Close()

        re := regexp.MustCompile(`^(?P<key>\w+)=(?P<value>.+)`)

        // Read line-by-line
        s := bufio.NewScanner(f)
        for s.Scan() {
                line := s.Text()
                if m := re.FindStringSubmatch(line); m != nil {
                        release[m[1]] = strings.Trim(m[2], `"`)
                }
        }
        return release, nil
}

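// Editor's note (illustrative sketch with assumed file contents): given an
// os-release file such as
//
//      NAME="Ubuntu"
//      ID=ubuntu
//      VERSION_ID="22.04"
//
// parseOSReleaseFromFile returns map[string]string{"NAME": "Ubuntu",
// "ID": "ubuntu", "VERSION_ID": "22.04"}; strings.Trim strips the surrounding
// double quotes from each value.
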
// TransformDCGMExporterService transforms the nvidia-dcgm-exporter Service with required config as per ClusterPolicy
func TransformDCGMExporterService(obj *corev1.Service, config *gpuv1.ClusterPolicySpec) error {
        serviceConfig := config.DCGMExporter.ServiceSpec
        if serviceConfig != nil {
                if len(serviceConfig.Type) > 0 {
                        obj.Spec.Type = serviceConfig.Type
                }

                if serviceConfig.InternalTrafficPolicy != nil {
                        obj.Spec.InternalTrafficPolicy = serviceConfig.InternalTrafficPolicy
                }
        }
        return nil
}

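// Editor's note (illustrative sketch; the exact ClusterPolicy field names are an
// assumption): a dcgmExporter serviceSpec setting type to "NodePort" and
// internalTrafficPolicy to "Local" would have both values copied onto the
// nvidia-dcgm-exporter Service above, while unset fields keep the defaults from
// the bundled manifest.
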
// TransformDriver transforms Nvidia driver daemonset with required config as per ClusterPolicy
func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        // update validation container
        err := transformValidationInitContainer(obj, config)
        if err != nil {
                return err
        }

        // update driver-manager initContainer
        err = transformDriverManagerInitContainer(obj, &config.Driver.Manager, config.Driver.GPUDirectRDMA)
        if err != nil {
                return err
        }

        // update nvidia-driver container
        err = transformDriverContainer(obj, config, n)
        if err != nil {
                return err
        }

        // update nvidia-peermem sidecar container
        err = transformPeerMemoryContainer(obj, config, n)
        if err != nil {
                return err
        }

        // update nvidia-fs sidecar container
        err = transformGDSContainer(obj, config, n)
        if err != nil {
                return err
        }

        // update nvidia-gdrcopy sidecar container
        err = transformGDRCopyContainer(obj, config, n)
        if err != nil {
                return err
        }

        // update/remove OpenShift Driver Toolkit sidecar container
        err = transformOpenShiftDriverToolkitContainer(obj, config, n, "nvidia-driver-ctr")
        if err != nil {
                return fmt.Errorf("ERROR: failed to transform the Driver Toolkit Container: %s", err)
        }

        // updates for per kernel version pods using pre-compiled drivers
        if config.Driver.UsePrecompiledDrivers() {
                err = transformPrecompiledDriverDaemonset(obj, n)
                if err != nil {
                        return fmt.Errorf("ERROR: failed to transform the pre-compiled Driver Daemonset: %s", err)
                }
        }

        // Compute driver configuration digest after all transformations are complete
        configDigest := utils.GetObjectHash(obj.Spec)

        // Set the computed digest in driver-manager initContainer
        driverManagerContainer := findContainerByName(obj.Spec.Template.Spec.InitContainers, "k8s-driver-manager")
        setContainerEnv(driverManagerContainer, "DRIVER_CONFIG_DIGEST", configDigest)

        // Set the computed digest in nvidia-driver container
        driverContainer := findContainerByName(obj.Spec.Template.Spec.Containers, "nvidia-driver-ctr")
        setContainerEnv(driverContainer, "DRIVER_CONFIG_DIGEST", configDigest)

        return nil
}

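// Editor's note (illustrative sketch; an assumption about intent, not upstream
// code): DRIVER_CONFIG_DIGEST is a hash over the fully transformed DaemonSet
// spec, so any driver configuration change yields a new digest. A consumer could
// detect drift by recomputing and comparing:
//
//      current := utils.GetObjectHash(obj.Spec)
//      if current != os.Getenv("DRIVER_CONFIG_DIGEST") {
//              // running driver pods no longer match the desired configuration
//      }
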
// TransformVGPUManager transforms NVIDIA vGPU Manager daemonset with required config as per ClusterPolicy
func TransformVGPUManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        // update k8s-driver-manager initContainer
        err := transformDriverManagerInitContainer(obj, &config.VGPUManager.DriverManager, nil)
        if err != nil {
                return fmt.Errorf("failed to transform k8s-driver-manager initContainer for vGPU Manager: %v", err)
        }

        // update nvidia-vgpu-manager container
        err = transformVGPUManagerContainer(obj, config, n)
        if err != nil {
                return fmt.Errorf("failed to transform vGPU Manager container: %v", err)
        }

        // update OpenShift Driver Toolkit sidecar container
        err = transformOpenShiftDriverToolkitContainer(obj, config, n, "nvidia-vgpu-manager-ctr")
        if err != nil {
                return fmt.Errorf("failed to transform the Driver Toolkit container: %s", err)
        }

        // Compute configuration digest after all transformations are complete
        digest := utils.GetObjectHash(obj.Spec)

        driverManagerContainer := findContainerByName(obj.Spec.Template.Spec.InitContainers, "k8s-driver-manager")
        setContainerEnv(driverManagerContainer, "DRIVER_CONFIG_DIGEST", digest)

        return nil
}

// applyOCPProxySpec applies proxy settings to podSpec
func applyOCPProxySpec(n ClusterPolicyController, podSpec *corev1.PodSpec) error {
        // Pass HTTPS_PROXY, HTTP_PROXY and NO_PROXY env if set in clusterwide proxy for OCP
        proxy, err := GetClusterWideProxy(n.ctx)
        if err != nil {
                return fmt.Errorf("ERROR: failed to get clusterwide proxy object: %s", err)
        }

        if proxy == nil {
                // no clusterwide proxy configured
                return nil
        }

        for i, container := range podSpec.Containers {
                // skip if not nvidia-driver container
                if !strings.Contains(container.Name, "nvidia-driver") {
                        continue
                }

                proxyEnv := getProxyEnv(proxy)
                if len(proxyEnv) != 0 {
                        podSpec.Containers[i].Env = append(podSpec.Containers[i].Env, proxyEnv...)
                }

                // if user-ca-bundle is set up in proxy, create a trusted-ca configmap and add volume mount
                if proxy.Spec.TrustedCA.Name == "" {
                        return nil
                }

                // create trusted-ca configmap to inject custom user ca bundle into it
                _, err = getOrCreateTrustedCAConfigMap(n, TrustedCAConfigMapName)
                if err != nil {
                        return err
                }

                // mount trusted-ca configmap
                podSpec.Containers[i].VolumeMounts = append(podSpec.Containers[i].VolumeMounts,
                        corev1.VolumeMount{
                                Name:      TrustedCAConfigMapName,
                                ReadOnly:  true,
                                MountPath: TrustedCABundleMountDir,
                        })
                podSpec.Volumes = append(podSpec.Volumes,
                        corev1.Volume{
                                Name: TrustedCAConfigMapName,
                                VolumeSource: corev1.VolumeSource{
                                        ConfigMap: &corev1.ConfigMapVolumeSource{
                                                LocalObjectReference: corev1.LocalObjectReference{
                                                        Name: TrustedCAConfigMapName,
                                                },
                                                Items: []corev1.KeyToPath{
                                                        {
                                                                Key:  TrustedCABundleFileName,
                                                                Path: TrustedCACertificate,
                                                        },
                                                },
                                        },
                                },
                        })
        }
        return nil
}

// getOrCreateTrustedCAConfigMap creates or returns an existing Trusted CA Bundle ConfigMap.
func getOrCreateTrustedCAConfigMap(n ClusterPolicyController, name string) (*corev1.ConfigMap, error) {
        ctx := n.ctx
        configMap := &corev1.ConfigMap{
                TypeMeta: metav1.TypeMeta{
                        Kind:       "ConfigMap",
                        APIVersion: corev1.SchemeGroupVersion.String(),
                },
                ObjectMeta: metav1.ObjectMeta{
                        Name:      name,
                        Namespace: n.operatorNamespace,
                },
                Data: map[string]string{
                        TrustedCABundleFileName: "",
                },
        }

        // apply label "config.openshift.io/inject-trusted-cabundle: true", so that cert is automatically filled/updated.
        configMap.Labels = make(map[string]string)
        configMap.Labels["config.openshift.io/inject-trusted-cabundle"] = "true"

        logger := n.logger.WithValues("ConfigMap", configMap.Name, "Namespace", configMap.Namespace)

        if err := controllerutil.SetControllerReference(n.singleton, configMap, n.scheme); err != nil {
                return nil, err
        }

        found := &corev1.ConfigMap{}
        err := n.client.Get(ctx, types.NamespacedName{Namespace: configMap.Namespace, Name: configMap.Name}, found)
        if err != nil && apierrors.IsNotFound(err) {
                logger.Info("Not found, creating")
                err = n.client.Create(ctx, configMap)
                if err != nil {
                        logger.Info("Couldn't create")
                        return nil, fmt.Errorf("failed to create trusted CA bundle config map %q: %s", name, err)
                }
                return configMap, nil
        } else if err != nil {
                return nil, fmt.Errorf("failed to get trusted CA bundle config map %q: %s", name, err)
        }

        return found, nil
}

// getProxyEnv returns the proxy env variables from the cluster-wide proxy in OCP
func getProxyEnv(proxyConfig *apiconfigv1.Proxy) []corev1.EnvVar {
        envVars := []corev1.EnvVar{}
        if proxyConfig == nil {
                return envVars
        }
        proxies := map[string]string{
                "HTTPS_PROXY": proxyConfig.Spec.HTTPSProxy,
                "HTTP_PROXY":  proxyConfig.Spec.HTTPProxy,
                "NO_PROXY":    proxyConfig.Spec.NoProxy,
        }
        var envs []string
        for k := range proxies {
                envs = append(envs, k)
        }
        // sort to ensure a deterministic order when we add these env vars to the pod spec
        sort.Strings(envs)

        for _, e := range envs {
                v := proxies[e]
                if len(v) == 0 {
                        continue
                }
                upperCaseEnvvar := corev1.EnvVar{
                        Name:  strings.ToUpper(e),
                        Value: v,
                }
                lowerCaseEnvvar := corev1.EnvVar{
                        Name:  strings.ToLower(e),
                        Value: v,
                }
                envVars = append(envVars, upperCaseEnvvar, lowerCaseEnvvar)
        }

        return envVars
}

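// transformToolkitCtrForCDI updates the toolkit container env so that GPUs are injected into
// workloads via CDI rather than by configuring 'nvidia' as the default runtime.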
func transformToolkitCtrForCDI(container *corev1.Container) {
        // When CDI is enabled in GPU Operator, we leverage native CDI support in containerd / cri-o
        // to inject GPUs into workloads. We do not configure 'nvidia' as the default runtime. The
        // 'nvidia' runtime will be set as the runtime class for our management containers so that
        // they get access to all GPUs.
        //
        // Note: one could override this and continue to configure 'nvidia' as the default runtime
        // by directly setting the 'NVIDIA_RUNTIME_SET_AS_DEFAULT' environment variable to 'true' in
        // the toolkit container. One can leverage the 'toolkit.env' field in ClusterPolicy to
        // directly configure environment variables for the toolkit container.
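        //
        // For example, a hypothetical ClusterPolicy snippet that opts back into the old behavior:
        //
        //   toolkit:
        //     env:
        //     - name: NVIDIA_RUNTIME_SET_AS_DEFAULT
        //       value: "true"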
        setContainerEnv(container, CDIEnabledEnvName, "true")
        setContainerEnv(container, NvidiaRuntimeSetAsDefaultEnvName, "false")
        setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi")
        setContainerEnv(container, CRIOConfigModeEnvName, "config")
}

// TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy
func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        toolkitContainerName := "nvidia-container-toolkit-ctr"
        toolkitMainContainer := findContainerByName(obj.Spec.Template.Spec.Containers, toolkitContainerName)
        if toolkitMainContainer == nil {
                return fmt.Errorf("failed to find toolkit container %q", toolkitContainerName)
        }

        // update validation container
        err := transformValidationInitContainer(obj, config)
        if err != nil {
                return err
        }
        // update image
        image, err := gpuv1.ImagePath(&config.Toolkit)
        if err != nil {
                return err
        }
        toolkitMainContainer.Image = image

        // update image pull policy
        toolkitMainContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.Toolkit.ImagePullPolicy)

        // set image pull secrets
        if len(config.Toolkit.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.Toolkit.ImagePullSecrets)
        }

        // set resource limits
        if config.Toolkit.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.Toolkit.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.Toolkit.Resources.Limits
                }
        }

        // update env required for CDI support
        if config.CDI.IsEnabled() {
                transformToolkitCtrForCDI(toolkitMainContainer)
        } else if n.runtime == gpuv1.CRIO {
                // (cdesiniotis) When CDI is not enabled and cri-o is the container runtime,
                // we continue to install the OCI prestart hook as opposed to adding nvidia
                // runtime handlers to the cri-o configuration. Users can override this behavior
                // and have nvidia runtime handlers added to the cri-o configuration by setting
                // the 'CRIO_CONFIG_MODE' environment variable to 'config' in the toolkit container.
                // However, one should note that setting 'CRIO_CONFIG_MODE' to 'config' in this case
                // (when CDI is not enabled) would result in the 'nvidia' runtime being set as
                // the default runtime. While this should work in theory, it is a significant
                // change -- which was the primary motivation to continue using the OCI prestart
                // hook by default in this case.
                setContainerEnv(toolkitMainContainer, CRIOConfigModeEnvName, "hook")
        }

        // set install directory for the toolkit
        if config.Toolkit.InstallDir != "" && config.Toolkit.InstallDir != DefaultToolkitInstallDir {
                setContainerEnv(toolkitMainContainer, ToolkitInstallDirEnvName, config.Toolkit.InstallDir)

                for i, volume := range obj.Spec.Template.Spec.Volumes {
                        if volume.Name == "toolkit-install-dir" {
                                obj.Spec.Template.Spec.Volumes[i].HostPath.Path = config.Toolkit.InstallDir
                                break
                        }
                }

                for i, volumeMount := range toolkitMainContainer.VolumeMounts {
                        if volumeMount.Name == "toolkit-install-dir" {
                                toolkitMainContainer.VolumeMounts[i].MountPath = config.Toolkit.InstallDir
                                break
                        }
                }
        }

        // Update CRI-O hooks path to use the default path for non-OCP cases
        if n.openshift == "" && n.runtime == gpuv1.CRIO {
                for index, volume := range obj.Spec.Template.Spec.Volumes {
                        if volume.Name == "crio-hooks" {
                                obj.Spec.Template.Spec.Volumes[index].HostPath.Path = "/usr/share/containers/oci/hooks.d"
                        }
                }
        }

        if len(config.Toolkit.Env) > 0 {
                for _, env := range config.Toolkit.Env {
                        setContainerEnv(toolkitMainContainer, env.Name, env.Value)
                }
        }

        // configure runtime
        runtime := n.runtime.String()
        err = transformForRuntime(obj, config, runtime, toolkitMainContainer)
        if err != nil {
                return fmt.Errorf("error transforming toolkit daemonset: %w", err)
        }

        return nil
}

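// transformForRuntime configures the given container for the cluster's container runtime: it sets
// runtime-specific env vars and mounts the runtime's config file(s) and socket into the container.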
func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, runtime string, container *corev1.Container) error {
        setContainerEnv(container, "RUNTIME", runtime)

        if runtime == gpuv1.Containerd.String() {
                // Set the runtime class name that is to be configured for containerd
                setContainerEnv(container, "CONTAINERD_RUNTIME_CLASS", getRuntimeClassName(config))
        }

        // For runtime config files we have top-level configs and drop-in files.
        // These are supported as follows:
        //   * Docker only supports top-level config files.
        //   * Containerd supports drop-in files, but requires a modification to the top-level config.
        //   * Cri-o supports drop-in files at a predefined location. The top-level config may be read
        //     but should not be updated.

        // setup mounts for runtime config file
        topLevelConfigFile, dropInConfigFile, err := getRuntimeConfigFiles(container, runtime)
        if err != nil {
                return fmt.Errorf("error getting path to runtime config file: %w", err)
        }

        var configEnvvarName string
        switch runtime {
        case gpuv1.Containerd.String():
                configEnvvarName = "CONTAINERD_CONFIG"
        case gpuv1.Docker.String():
                configEnvvarName = "DOCKER_CONFIG"
        case gpuv1.CRIO.String():
                configEnvvarName = "CRIO_CONFIG"
        }

        // Handle the top-level configs
        if topLevelConfigFile != "" {
                sourceConfigFileName := path.Base(topLevelConfigFile)
                sourceConfigDir := path.Dir(topLevelConfigFile)
                containerConfigDir := DefaultRuntimeConfigTargetDir
                setContainerEnv(container, "RUNTIME_CONFIG", containerConfigDir+sourceConfigFileName)
                setContainerEnv(container, configEnvvarName, containerConfigDir+sourceConfigFileName)

                volMountConfigName := fmt.Sprintf("%s-config", runtime)
                volMountConfig := corev1.VolumeMount{Name: volMountConfigName, MountPath: containerConfigDir}
                container.VolumeMounts = append(container.VolumeMounts, volMountConfig)

                configVol := corev1.Volume{Name: volMountConfigName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: sourceConfigDir, Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}}
                obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, configVol)
        }

        // Handle the drop-in configs
        // TODO: It's a bit of a hack to skip the `nvidia-kata-manager` container here.
        // Ideally if the two projects are using the SAME API then this should be
        // captured more rigorously.
        // Note that we probably want to implement drop-in file support in the
        // kata manager in any case -- in which case it will be good to use a
        // similar implementation.
        if dropInConfigFile != "" && container.Name != "nvidia-kata-manager" {
                sourceConfigFileName := path.Base(dropInConfigFile)
                sourceConfigDir := path.Dir(dropInConfigFile)
                containerConfigDir := DefaultRuntimeDropInConfigTargetDir
                setContainerEnv(container, "RUNTIME_DROP_IN_CONFIG", containerConfigDir+sourceConfigFileName)
                setContainerEnv(container, "RUNTIME_DROP_IN_CONFIG_HOST_PATH", dropInConfigFile)

                volMountConfigName := fmt.Sprintf("%s-drop-in-config", runtime)
                volMountConfig := corev1.VolumeMount{Name: volMountConfigName, MountPath: containerConfigDir}
                container.VolumeMounts = append(container.VolumeMounts, volMountConfig)

                configVol := corev1.Volume{Name: volMountConfigName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: sourceConfigDir, Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}}
                obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, configVol)
        }

        // Handle any additional runtime config sources
        const runtimeConfigSourceFile = "file"
        if runtimeConfigSources := getContainerEnv(container, "RUNTIME_CONFIG_SOURCE"); runtimeConfigSources != "" {
                var sources []string
                for _, runtimeConfigSource := range strings.Split(runtimeConfigSources, ",") {
                        parts := strings.SplitN(runtimeConfigSource, "=", 2)
                        if len(parts) == 1 || parts[0] != runtimeConfigSourceFile {
                                sources = append(sources, runtimeConfigSource)
                                continue
                        }
                        // We transform the host path to a container path by prepending "/host" to the file
                        // path. This works because the toolkit container has the host's root filesystem
                        // mounted as read-only at "/host".
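                        // e.g. "file=/etc/containerd/config.toml" becomes "file=/host/etc/containerd/config.toml".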
                        sourceConfigFile := filepath.Join("/host", parts[1])
                        sources = append(sources, runtimeConfigSourceFile+"="+sourceConfigFile)
                }
                setContainerEnv(container, "RUNTIME_CONFIG_SOURCE", strings.Join(sources, ","))
        }

        // setup mounts for runtime socket file
        runtimeSocketFile, err := getRuntimeSocketFile(container, runtime)
        if err != nil {
                return fmt.Errorf("error getting path to runtime socket: %w", err)
        }
        if runtimeSocketFile != "" {
                sourceSocketFileName := path.Base(runtimeSocketFile)
                // set envvar for runtime socket
                var socketEnvvarName string
                if runtime == gpuv1.Containerd.String() {
                        socketEnvvarName = "CONTAINERD_SOCKET"
                } else if runtime == gpuv1.Docker.String() {
                        socketEnvvarName = "DOCKER_SOCKET"
                }
                setContainerEnv(container, "RUNTIME_SOCKET", DefaultRuntimeSocketTargetDir+sourceSocketFileName)
                setContainerEnv(container, socketEnvvarName, DefaultRuntimeSocketTargetDir+sourceSocketFileName)

                volMountSocketName := fmt.Sprintf("%s-socket", runtime)
                volMountSocket := corev1.VolumeMount{Name: volMountSocketName, MountPath: DefaultRuntimeSocketTargetDir}
                container.VolumeMounts = append(container.VolumeMounts, volMountSocket)

                socketVol := corev1.Volume{Name: volMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(runtimeSocketFile)}}}
                obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, socketVol)
        }
        return nil
}

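// transformDevicePluginCtrForCDI updates the device-plugin container env so that devices are
// advertised and injected using CDI.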
func transformDevicePluginCtrForCDI(container *corev1.Container, config *gpuv1.ClusterPolicySpec) {
        setContainerEnv(container, CDIEnabledEnvName, "true")
        setContainerEnv(container, DeviceListStrategyEnvName, "cdi-annotations,cdi-cri")
        setContainerEnv(container, CDIAnnotationPrefixEnvName, "cdi.k8s.io/")

        if config.Toolkit.IsEnabled() {
                setContainerEnv(container, NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook"))
        }
}

// TransformDevicePlugin transforms k8s-device-plugin daemonset with required config as per ClusterPolicy
func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        devicePluginContainerName := "nvidia-device-plugin"
        devicePluginMainContainer := findContainerByName(obj.Spec.Template.Spec.Containers, devicePluginContainerName)
        if devicePluginMainContainer == nil {
                return fmt.Errorf("failed to find device plugin container %q", devicePluginContainerName)
        }

        // update validation container
        err := transformValidationInitContainer(obj, config)
        if err != nil {
                return err
        }

        // update image
        image, err := gpuv1.ImagePath(&config.DevicePlugin)
        if err != nil {
                return err
        }
        devicePluginMainContainer.Image = image

        // update image pull policy
        devicePluginMainContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy)

        // set image pull secrets
        if len(config.DevicePlugin.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.DevicePlugin.ImagePullSecrets)
        }

        // set resource limits
        if config.DevicePlugin.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DevicePlugin.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DevicePlugin.Resources.Limits
                }
        }
        // set arguments if specified for device-plugin container
        if len(config.DevicePlugin.Args) > 0 {
                devicePluginMainContainer.Args = config.DevicePlugin.Args
        }

        // add env to allow injection of /dev/nvidia-fs and /dev/infiniband devices for GDS
        if config.GPUDirectStorage != nil && config.GPUDirectStorage.IsEnabled() {
                setContainerEnv(devicePluginMainContainer, GDSEnabledEnvName, "true")
                setContainerEnv(devicePluginMainContainer, MOFEDEnabledEnvName, "true")
        }

        if config.GDRCopy != nil && config.GDRCopy.IsEnabled() {
                setContainerEnv(devicePluginMainContainer, GDRCopyEnabledEnvName, "true")
        }

        // apply plugin configuration through ConfigMap if one is provided
        err = handleDevicePluginConfig(obj, config)
        if err != nil {
                return err
        }

        setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

        // update env required for MIG support
        applyMIGConfiguration(devicePluginMainContainer, config.MIG.Strategy)

        // update env required for CDI support
        if config.CDI.IsEnabled() {
                transformDevicePluginCtrForCDI(devicePluginMainContainer, config)
        }

        // update MPS volumes and set MPS_ROOT env var if a custom MPS root is configured
        if config.DevicePlugin.MPS != nil && config.DevicePlugin.MPS.Root != "" &&
                config.DevicePlugin.MPS.Root != DefaultMPSRoot {
                for i, volume := range obj.Spec.Template.Spec.Volumes {
                        switch volume.Name {
                        case "mps-root":
                                obj.Spec.Template.Spec.Volumes[i].HostPath.Path = config.DevicePlugin.MPS.Root
                        case "mps-shm":
                                obj.Spec.Template.Spec.Volumes[i].HostPath.Path = filepath.Join(config.DevicePlugin.MPS.Root, "shm")
                        }
                }
                setContainerEnv(devicePluginMainContainer, MPSRootEnvName, config.DevicePlugin.MPS.Root)
        }

        if len(config.DevicePlugin.Env) > 0 {
                for _, env := range config.DevicePlugin.Env {
                        setContainerEnv(devicePluginMainContainer, env.Name, env.Value)
                }
        }

        return nil
}

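// TransformMPSControlDaemon transforms the MPS control daemon daemonset with required config as per ClusterPolicy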
func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        // update validation container
        err := transformValidationInitContainer(obj, config)
        if err != nil {
                return err
        }

        image, err := gpuv1.ImagePath(&config.DevicePlugin)
        if err != nil {
                return err
        }
        imagePullPolicy := gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy)

        // update image path and imagePullPolicy for 'mps-control-daemon-mounts' initContainer
        if initCtr := findContainerByName(obj.Spec.Template.Spec.InitContainers, "mps-control-daemon-mounts"); initCtr != nil {
                initCtr.Image = image
                initCtr.ImagePullPolicy = imagePullPolicy
        }

        // update image path and imagePullPolicy for main container
        mpsControlMainContainer := findContainerByName(obj.Spec.Template.Spec.Containers, "mps-control-daemon-ctr")
        if mpsControlMainContainer == nil {
                return fmt.Errorf("failed to find main container 'mps-control-daemon-ctr'")
        }
        mpsControlMainContainer.Image = image
        mpsControlMainContainer.ImagePullPolicy = imagePullPolicy

        // set image pull secrets
        if len(config.DevicePlugin.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.DevicePlugin.ImagePullSecrets)
        }

        // set resource limits
        if config.DevicePlugin.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DevicePlugin.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DevicePlugin.Resources.Limits
                }
        }

        // apply plugin configuration through ConfigMap if one is provided
        err = handleDevicePluginConfig(obj, config)
        if err != nil {
                return err
        }

        setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

        // update env required for MIG support
        applyMIGConfiguration(mpsControlMainContainer, config.MIG.Strategy)

        // update MPS volumes if a custom MPS root is configured
        if config.DevicePlugin.MPS != nil && config.DevicePlugin.MPS.Root != "" &&
                config.DevicePlugin.MPS.Root != DefaultMPSRoot {
                for i, volume := range obj.Spec.Template.Spec.Volumes {
                        switch volume.Name {
                        case "mps-root":
                                obj.Spec.Template.Spec.Volumes[i].HostPath.Path = config.DevicePlugin.MPS.Root
                        case "mps-shm":
                                obj.Spec.Template.Spec.Volumes[i].HostPath.Path = filepath.Join(config.DevicePlugin.MPS.Root, "shm")
                        }
                }
        }

        return nil
}

// TransformSandboxDevicePlugin transforms sandbox-device-plugin daemonset with required config as per ClusterPolicy
func TransformSandboxDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        // update validation container
        err := transformValidationInitContainer(obj, config)
        if err != nil {
                return err
        }
        // update image
        image, err := gpuv1.ImagePath(&config.SandboxDevicePlugin)
        if err != nil {
                return err
        }
        obj.Spec.Template.Spec.Containers[0].Image = image

        // update image pull policy
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.SandboxDevicePlugin.ImagePullPolicy)
        // set image pull secrets
        if len(config.SandboxDevicePlugin.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.SandboxDevicePlugin.ImagePullSecrets)
        }
        // set resource limits
        if config.SandboxDevicePlugin.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.SandboxDevicePlugin.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.SandboxDevicePlugin.Resources.Limits
                }
        }
        // set arguments if specified for sandbox-device-plugin container
        if len(config.SandboxDevicePlugin.Args) > 0 {
                obj.Spec.Template.Spec.Containers[0].Args = config.SandboxDevicePlugin.Args
        }
        // set/append environment variables for sandbox-device-plugin container
        if len(config.SandboxDevicePlugin.Env) > 0 {
                for _, env := range config.SandboxDevicePlugin.Env {
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
                }
        }
        return nil
}

// TransformDCGMExporter transforms dcgm exporter daemonset with required config as per ClusterPolicy
func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        // update validation container
        err := transformValidationInitContainer(obj, config)
        if err != nil {
                return err
        }

        // update image
        image, err := gpuv1.ImagePath(&config.DCGMExporter)
        if err != nil {
                return err
        }
        obj.Spec.Template.Spec.Containers[0].Image = image

        // update image pull policy
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DCGMExporter.ImagePullPolicy)
        // set image pull secrets
        if len(config.DCGMExporter.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.DCGMExporter.ImagePullSecrets)
        }
        // set resource limits
        if config.DCGMExporter.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DCGMExporter.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DCGMExporter.Resources.Limits
                }
        }
        // set arguments if specified for exporter container
        if len(config.DCGMExporter.Args) > 0 {
                obj.Spec.Template.Spec.Containers[0].Args = config.DCGMExporter.Args
        }

        // check if DCGM hostengine is enabled as a separate Pod and setup env accordingly
        if config.DCGM.IsEnabled() {
                setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName, fmt.Sprintf("nvidia-dcgm:%d", DCGMDefaultPort))
        } else {
                // case for DCGM running on the host itself (DGX BaseOS)
                remoteEngine := getContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName)
                if remoteEngine != "" && strings.HasPrefix(remoteEngine, "localhost") {
                        // enable hostNetwork for communication with external DCGM using localhost
                        obj.Spec.Template.Spec.HostNetwork = true
                        obj.Spec.Template.Spec.DNSPolicy = corev1.DNSClusterFirstWithHostNet
                }
        }
        // set hostNetwork if specified for DCGM Exporter (if it is already enabled above,
        // do not touch the value)
        if config.DCGMExporter.IsHostNetworkEnabled() {
                obj.Spec.Template.Spec.HostNetwork = true
                obj.Spec.Template.Spec.DNSPolicy = corev1.DNSClusterFirstWithHostNet
        }

        setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

        // set hostPID if specified for DCGM Exporter
        if config.DCGMExporter.IsHostPIDEnabled() {
                obj.Spec.Template.Spec.HostPID = true
        }

        // configure HPC job mapping if enabled
        if config.DCGMExporter.IsHPCJobMappingEnabled() {
                jobMappingDir := config.DCGMExporter.GetHPCJobMappingDirectory()
                if jobMappingDir == "" {
                        jobMappingDir = gpuv1.DefaultDCGMJobMappingDir
                }

                // set environment variable for DCGM Exporter
                setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_HPC_JOB_MAPPING_DIR", jobMappingDir)

                // add volumeMount to main container
                jobMappingVolMount := corev1.VolumeMount{Name: "hpc-job-mapping", ReadOnly: true, MountPath: jobMappingDir}
                obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, jobMappingVolMount)

                // add volume
                jobMappingVolumeSource := corev1.VolumeSource{
                        HostPath: &corev1.HostPathVolumeSource{
                                Path: jobMappingDir,
                                Type: ptr.To(corev1.HostPathDirectoryOrCreate),
                        },
                }
                jobMappingVol := corev1.Volume{Name: "hpc-job-mapping", VolumeSource: jobMappingVolumeSource}
                obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, jobMappingVol)
        }

        // mount configmap for custom metrics if provided by user
        if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" {
                metricsConfigVolMount := corev1.VolumeMount{Name: "metrics-config", ReadOnly: true, MountPath: MetricsConfigMountPath, SubPath: MetricsConfigFileName}
                obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, metricsConfigVolMount)

                metricsConfigVolumeSource := corev1.VolumeSource{
                        ConfigMap: &corev1.ConfigMapVolumeSource{
                                LocalObjectReference: corev1.LocalObjectReference{
                                        Name: config.DCGMExporter.MetricsConfig.Name,
                                },
                                Items: []corev1.KeyToPath{
                                        {
                                                Key:  MetricsConfigFileName,
                                                Path: MetricsConfigFileName,
                                        },
                                },
                        },
                }
                metricsConfigVol := corev1.Volume{Name: "metrics-config", VolumeSource: metricsConfigVolumeSource}
                obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, metricsConfigVol)

                setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_COLLECTORS", MetricsConfigMountPath)
        }

        if n.openshift != "" {
                if err = transformDCGMExporterForOpenShift(obj, config); err != nil {
                        return fmt.Errorf("failed to transform dcgm-exporter for openshift: %w", err)
                }
        }

        for _, env := range config.DCGMExporter.Env {
                setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
        }

        return nil
}

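// transformDCGMExporterForOpenShift updates the dcgm-exporter daemonset with OpenShift-specific
// configuration, notably an initContainer that prepares /var/lib/kubelet/pod-resources.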
func transformDCGMExporterForOpenShift(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
        // Add initContainer for OCP to set proper SELinux context on /var/lib/kubelet/pod-resources
        initImage, err := gpuv1.ImagePath(&config.Operator.InitContainer)
        if err != nil {
                return err
        }

        initContainer := corev1.Container{}
        if initImage != "" {
                initContainer.Image = initImage
        }
        initContainer.Name = "init-pod-nvidia-node-status-exporter"
        initContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.Operator.InitContainer.ImagePullPolicy)
        initContainer.Command = []string{"/bin/entrypoint.sh"}

        // need CAP_SYS_ADMIN privileges for collecting pod specific resources
        privileged := true
        securityContext := &corev1.SecurityContext{
                Privileged: &privileged,
        }

        initContainer.SecurityContext = securityContext

        // Disable all constraints on the configurations required by NVIDIA container toolkit
        setContainerEnv(&initContainer, NvidiaDisableRequireEnvName, "true")

        volMountSockName, volMountSockPath := "pod-gpu-resources", "/var/lib/kubelet/pod-resources"
        volMountSock := corev1.VolumeMount{Name: volMountSockName, MountPath: volMountSockPath}
        initContainer.VolumeMounts = append(initContainer.VolumeMounts, volMountSock)

        volMountConfigName, volMountConfigPath, volMountConfigSubPath := "init-config", "/bin/entrypoint.sh", "entrypoint.sh"
        volMountConfig := corev1.VolumeMount{Name: volMountConfigName, ReadOnly: true, MountPath: volMountConfigPath, SubPath: volMountConfigSubPath}
        initContainer.VolumeMounts = append(initContainer.VolumeMounts, volMountConfig)

        obj.Spec.Template.Spec.InitContainers = append(obj.Spec.Template.Spec.InitContainers, initContainer)

        volMountConfigKey, volMountConfigDefaultMode := "nvidia-dcgm-exporter", int32(0700)
        initVol := corev1.Volume{Name: volMountConfigName, VolumeSource: corev1.VolumeSource{ConfigMap: &corev1.ConfigMapVolumeSource{LocalObjectReference: corev1.LocalObjectReference{Name: volMountConfigKey}, DefaultMode: &volMountConfigDefaultMode}}}
        obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, initVol)

        return nil
}

// TransformDCGM transforms dcgm daemonset with required config as per ClusterPolicy
func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        // update validation container
        err := transformValidationInitContainer(obj, config)
        if err != nil {
                return err
        }
        // update image
        image, err := gpuv1.ImagePath(&config.DCGM)
        if err != nil {
                return err
        }
        obj.Spec.Template.Spec.Containers[0].Image = image
        // update image pull policy
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DCGM.ImagePullPolicy)
        // set image pull secrets
        if len(config.DCGM.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.DCGM.ImagePullSecrets)
        }
        // set resource limits
        if config.DCGM.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DCGM.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DCGM.Resources.Limits
                }
        }
        // set arguments if specified for dcgm container
        if len(config.DCGM.Args) > 0 {
                obj.Spec.Template.Spec.Containers[0].Args = config.DCGM.Args
        }
        // set/append environment variables for dcgm container
        if len(config.DCGM.Env) > 0 {
                for _, env := range config.DCGM.Env {
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
                }
        }

        setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

        return nil
}

// TransformMIGManager transforms MIG Manager daemonset with required config as per ClusterPolicy
func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        // update validation container
        err := transformValidationInitContainer(obj, config)
        if err != nil {
                return err
        }

        // update image
        image, err := gpuv1.ImagePath(&config.MIGManager)
        if err != nil {
                return err
        }
        obj.Spec.Template.Spec.Containers[0].Image = image

        // update image pull policy
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.MIGManager.ImagePullPolicy)

        // set image pull secrets
        if len(config.MIGManager.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.MIGManager.ImagePullSecrets)
        }

        // set resource limits
        if config.MIGManager.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.MIGManager.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.MIGManager.Resources.Limits
                }
        }

        // set arguments if specified for mig-manager container
        if len(config.MIGManager.Args) > 0 {
                obj.Spec.Template.Spec.Containers[0].Args = config.MIGManager.Args
        }

        setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

        // set ConfigMap name for "mig-parted-config" Volume
        for i, vol := range obj.Spec.Template.Spec.Volumes {
                if !strings.Contains(vol.Name, "mig-parted-config") {
                        continue
                }

                name, _ := gpuv1.GetConfigMapName(config.MIGManager.Config, MigPartedDefaultConfigMapName)
                obj.Spec.Template.Spec.Volumes[i].ConfigMap.Name = name
                break
        }

        // set ConfigMap name for "gpu-clients" Volume
        for i, vol := range obj.Spec.Template.Spec.Volumes {
                if !strings.Contains(vol.Name, "gpu-clients") {
                        continue
                }

                name, _ := gpuv1.GetConfigMapName(config.MIGManager.GPUClientsConfig, MigDefaultGPUClientsConfigMapName)
                obj.Spec.Template.Spec.Volumes[i].ConfigMap.Name = name
                break
        }

        // update env required for CDI support
        if config.CDI.IsEnabled() {
                setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
                if config.Toolkit.IsEnabled() {
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook"))
                }
        }

        if len(config.MIGManager.Env) > 0 {
                for _, env := range config.MIGManager.Env {
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
                }
        }

        return nil
}

// TransformKataManager transforms Kata Manager daemonset with required config as per ClusterPolicy
func TransformKataManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        // update image
        image, err := gpuv1.ImagePath(&config.KataManager)
        if err != nil {
                return err
        }
        obj.Spec.Template.Spec.Containers[0].Image = image

        // update image pull policy
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.KataManager.ImagePullPolicy)

        // set image pull secrets
        if len(config.KataManager.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.KataManager.ImagePullSecrets)
        }

        // set resource limits
        if config.KataManager.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.KataManager.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.KataManager.Resources.Limits
                }
        }

        // set arguments if specified for kata-manager container
        if len(config.KataManager.Args) > 0 {
                obj.Spec.Template.Spec.Containers[0].Args = config.KataManager.Args
        }

        // mount artifactsDir
        artifactsDir := DefaultKataArtifactsDir
        if config.KataManager.Config.ArtifactsDir != "" {
                artifactsDir = config.KataManager.Config.ArtifactsDir
        }

        // set env used by readinessProbe to determine path to kata-manager pid file.
        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "KATA_ARTIFACTS_DIR", artifactsDir)

        artifactsVolMount := corev1.VolumeMount{Name: "kata-artifacts", MountPath: artifactsDir}
        obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, artifactsVolMount)

        artifactsVol := corev1.Volume{Name: "kata-artifacts", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: artifactsDir, Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}}
        obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, artifactsVol)

        // Compute hash of kata manager config and add an annotation with the value.
        // If the kata config changes, a new revision of the daemonset will be
        // created and thus the kata-manager pods will restart with the updated config.
        hash := utils.GetObjectHash(config.KataManager.Config)

        if obj.Spec.Template.Annotations == nil {
                obj.Spec.Template.Annotations = make(map[string]string)
        }
        obj.Spec.Template.Annotations[KataManagerAnnotationHashKey] = hash

        if len(config.KataManager.Env) > 0 {
                for _, env := range config.KataManager.Env {
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
                }
        }

        // mount containerd config and socket
        // setup mounts for runtime config file
        runtime := n.runtime.String()
        // kata manager is the only container in this daemonset
        err = transformForRuntime(obj, config, runtime, &obj.Spec.Template.Spec.Containers[0])
        if err != nil {
                return fmt.Errorf("error transforming kata-manager daemonset: %w", err)
        }

        return nil
}

// TransformVFIOManager transforms VFIO-PCI Manager daemonset with required config as per ClusterPolicy
func TransformVFIOManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        // update k8s-driver-manager initContainer
        err := transformDriverManagerInitContainer(obj, &config.VFIOManager.DriverManager, nil)
        if err != nil {
                return fmt.Errorf("failed to transform k8s-driver-manager initContainer for VFIO Manager: %v", err)
        }

        // update image
        image, err := gpuv1.ImagePath(&config.VFIOManager)
        if err != nil {
                return err
        }
        obj.Spec.Template.Spec.Containers[0].Image = image

        // update image pull policy
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.VFIOManager.ImagePullPolicy)

        // set image pull secrets
        if len(config.VFIOManager.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.VFIOManager.ImagePullSecrets)
        }

        // set resource limits
        if config.VFIOManager.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.VFIOManager.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.VFIOManager.Resources.Limits
                }
        }

        // set arguments if specified for vfio-manager container
        if len(config.VFIOManager.Args) > 0 {
                obj.Spec.Template.Spec.Containers[0].Args = config.VFIOManager.Args
        }

        // set/append environment variables for vfio-manager container
        if len(config.VFIOManager.Env) > 0 {
                for _, env := range config.VFIOManager.Env {
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
                }
        }

        // Compute configuration digest after all transformations are complete
        digest := utils.GetObjectHash(obj.Spec)

        // Set the computed digest in driver-manager initContainer
        driverManagerContainer := findContainerByName(obj.Spec.Template.Spec.InitContainers, "k8s-driver-manager")
        setContainerEnv(driverManagerContainer, "DRIVER_CONFIG_DIGEST", digest)

        return nil
}

// TransformCCManager transforms CC Manager daemonset with required config as per ClusterPolicy
2091
func TransformCCManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
2092
        // update image
1✔
2093
        image, err := gpuv1.ImagePath(&config.CCManager)
1✔
2094
        if err != nil {
1✔
2095
                return err
×
2096
        }
×
2097
        obj.Spec.Template.Spec.Containers[0].Image = image
1✔
2098

1✔
2099
        // update image pull policy
1✔
2100
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.CCManager.ImagePullPolicy)
1✔
2101

1✔
2102
        // set image pull secrets
1✔
2103
        if len(config.CCManager.ImagePullSecrets) > 0 {
2✔
2104
                addPullSecrets(&obj.Spec.Template.Spec, config.CCManager.ImagePullSecrets)
1✔
2105
        }
1✔
2106

2107
        // set resource limits
2108
        if config.CCManager.Resources != nil {
2✔
2109
                // apply resource limits to all containers
1✔
2110
                for i := range obj.Spec.Template.Spec.Containers {
2✔
2111
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.CCManager.Resources.Requests
1✔
2112
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.CCManager.Resources.Limits
1✔
2113
                }
1✔
2114
        }
2115

2116
        // set arguments if specified for cc-manager container
2117
        if len(config.CCManager.Args) > 0 {
2✔
2118
                obj.Spec.Template.Spec.Containers[0].Args = config.CCManager.Args
1✔
2119
        }
1✔
2120

2121
        // set default cc mode env
2122
        if config.CCManager.DefaultMode != "" {
2✔
2123
                setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DefaultCCModeEnvName, config.CCManager.DefaultMode)
1✔
2124
        }
1✔
2125

2126
        // set/append environment variables for cc-manager container
2127
        if len(config.CCManager.Env) > 0 {
2✔
2128
                for _, env := range config.CCManager.Env {
2✔
2129
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
1✔
2130
                }
1✔
2131
        }
2132

2133
        return nil
1✔
2134
}
2135

// TransformVGPUDeviceManager transforms VGPU Device Manager daemonset with required config as per ClusterPolicy
func TransformVGPUDeviceManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        // update validation container
        err := transformValidationInitContainer(obj, config)
        if err != nil {
                return err
        }

        // update image
        image, err := gpuv1.ImagePath(&config.VGPUDeviceManager)
        if err != nil {
                return err
        }
        obj.Spec.Template.Spec.Containers[0].Image = image

        // update image pull policy
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.VGPUDeviceManager.ImagePullPolicy)

        // set image pull secrets
        if len(config.VGPUDeviceManager.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.VGPUDeviceManager.ImagePullSecrets)
        }

        // set resource limits
        if config.VGPUDeviceManager.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.VGPUDeviceManager.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.VGPUDeviceManager.Resources.Limits
                }
        }

        // set arguments if specified for vgpu-device-manager container
        if len(config.VGPUDeviceManager.Args) > 0 {
                obj.Spec.Template.Spec.Containers[0].Args = config.VGPUDeviceManager.Args
        }

        // set/append environment variables for vgpu-device-manager container
        if len(config.VGPUDeviceManager.Env) > 0 {
                for _, env := range config.VGPUDeviceManager.Env {
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
                }
        }

        // set ConfigMap name for "vgpu-config" Volume
        for i, vol := range obj.Spec.Template.Spec.Volumes {
                if !strings.Contains(vol.Name, "vgpu-config") {
                        continue
                }

                name, _ := gpuv1.GetConfigMapName(config.VGPUDeviceManager.Config, VgpuDMDefaultConfigMapName)
                obj.Spec.Template.Spec.Volumes[i].ConfigMap.Name = name
                break
        }

        // set name of the default vGPU device configuration. The default configuration
        // is applied if the node is not labelled with a specific configuration
        defaultConfig := VgpuDMDefaultConfigName
        if config.VGPUDeviceManager.Config != nil && config.VGPUDeviceManager.Config.Default != "" {
                defaultConfig = config.VGPUDeviceManager.Config.Default
        }
        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DEFAULT_VGPU_CONFIG", defaultConfig)

        return nil
}

// transformValidatorSecurityContext updates the security context for a validator
// container so that it runs as uid 0. Some of the validations run commands
// that require root privileges (e.g. chroot). In addition, all validations
// create / delete status files in the '/run/nvidia/validations' host path
// volume. This directory is initially created by the kubelet and thus has
// the same group and ownership as the kubelet.
func transformValidatorSecurityContext(ctr *corev1.Container) {
        if ctr.SecurityContext == nil {
                ctr.SecurityContext = &corev1.SecurityContext{}
        }
        ctr.SecurityContext.RunAsUser = rootUID
}

// TransformValidator transforms nvidia-operator-validator daemonset with required config as per ClusterPolicy
func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        err := TransformValidatorShared(obj, config)
        if err != nil {
                return err
        }

        setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

        var validatorErr error
        // apply changes for individual component validators (initContainers)
        components := []string{
                "driver",
                "nvidia-fs",
                "gdrcopy",
                "toolkit",
                "cuda",
                "plugin",
        }

        for _, component := range components {
                if err := TransformValidatorComponent(config, &obj.Spec.Template.Spec, component); err != nil {
                        validatorErr = errors.Join(validatorErr, err)
                }
        }

        if validatorErr != nil {
                // logr loggers take structured key/value pairs rather than printf verbs
                n.logger.Info("WARN: errors transforming the validator containers", "error", validatorErr)
        }

        return nil
}

// TransformSandboxValidator transforms nvidia-sandbox-validator daemonset with required config as per ClusterPolicy
func TransformSandboxValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        err := TransformValidatorShared(obj, config)
        if err != nil {
                return err
        }

        var validatorErr error
        // apply changes for individual component validators (initContainers)
        components := []string{
                "cc-manager",
                "vfio-pci",
                "vgpu-manager",
                "vgpu-devices",
        }

        for _, component := range components {
                if err := TransformValidatorComponent(config, &obj.Spec.Template.Spec, component); err != nil {
                        validatorErr = errors.Join(validatorErr, err)
                }
        }

        if validatorErr != nil {
                // logr loggers take structured key/value pairs rather than printf verbs
                n.logger.Info("WARN: errors transforming the validator containers", "error", validatorErr)
        }

        return nil
}

// TransformValidatorShared applies general transformations to the validator daemonset with required config as per ClusterPolicy
func TransformValidatorShared(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
        // update image
        image, err := gpuv1.ImagePath(&config.Validator)
        if err != nil {
                return err
        }
        obj.Spec.Template.Spec.Containers[0].Image = image
        // update image pull policy
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.Validator.ImagePullPolicy)
        // set image pull secrets
        if len(config.Validator.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.Validator.ImagePullSecrets)
        }
        // set resource limits
        if config.Validator.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.Validator.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.Validator.Resources.Limits
                }
        }
        // set arguments if specified for validator container
        if len(config.Validator.Args) > 0 {
                obj.Spec.Template.Spec.Containers[0].Args = config.Validator.Args
        }
        // set/append environment variables for validator container
        if len(config.Validator.Env) > 0 {
                for _, env := range config.Validator.Env {
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
                }
        }
        // update the security context for the validator container
        transformValidatorSecurityContext(&obj.Spec.Template.Spec.Containers[0])

        return nil
}

// TransformValidatorComponent applies changes to given validator component
func TransformValidatorComponent(config *gpuv1.ClusterPolicySpec, podSpec *corev1.PodSpec, component string) error {
        for i, initContainer := range podSpec.InitContainers {
                // skip if not component validation initContainer
                if !strings.Contains(initContainer.Name, fmt.Sprintf("%s-validation", component)) {
                        continue
                }
                // update validation image
                image, err := gpuv1.ImagePath(&config.Validator)
                if err != nil {
                        return err
                }
                podSpec.InitContainers[i].Image = image
                // update validation image pull policy
                if config.Validator.ImagePullPolicy != "" {
                        podSpec.InitContainers[i].ImagePullPolicy = gpuv1.ImagePullPolicy(config.Validator.ImagePullPolicy)
                }
                // update the security context for the validator container
                transformValidatorSecurityContext(&podSpec.InitContainers[i])

                switch component {
                case "cuda":
                        // set additional env to indicate image, pullSecrets to spin off the cuda validation workload pod
                        setContainerEnv(&(podSpec.InitContainers[i]), ValidatorImageEnvName, image)
                        setContainerEnv(&(podSpec.InitContainers[i]), ValidatorImagePullPolicyEnvName, config.Validator.ImagePullPolicy)
                        var pullSecrets string
                        if len(config.Validator.ImagePullSecrets) > 0 {
                                pullSecrets = strings.Join(config.Validator.ImagePullSecrets, ",")
                                setContainerEnv(&(podSpec.InitContainers[i]), ValidatorImagePullSecretsEnvName, pullSecrets)
                        }
                        if podSpec.RuntimeClassName != nil {
                                setContainerEnv(&(podSpec.InitContainers[i]), ValidatorRuntimeClassEnvName, *podSpec.RuntimeClassName)
                        }
                        // set/append environment variables for cuda-validation container
                        if len(config.Validator.CUDA.Env) > 0 {
                                for _, env := range config.Validator.CUDA.Env {
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
                                }
                        }
                case "plugin":
                        // remove plugin init container from validator Daemonset if it is not enabled;
                        // removing element i while ranging is safe here because we return immediately
                        if !config.DevicePlugin.IsEnabled() {
                                podSpec.InitContainers = append(podSpec.InitContainers[:i], podSpec.InitContainers[i+1:]...)
                                return nil
                        }
                        // set additional env to indicate image, pullSecrets to spin off the plugin validation workload pod
                        setContainerEnv(&(podSpec.InitContainers[i]), ValidatorImageEnvName, image)
                        setContainerEnv(&(podSpec.InitContainers[i]), ValidatorImagePullPolicyEnvName, config.Validator.ImagePullPolicy)
                        var pullSecrets string
                        if len(config.Validator.ImagePullSecrets) > 0 {
                                pullSecrets = strings.Join(config.Validator.ImagePullSecrets, ",")
                                setContainerEnv(&(podSpec.InitContainers[i]), ValidatorImagePullSecretsEnvName, pullSecrets)
                        }
                        if podSpec.RuntimeClassName != nil {
                                setContainerEnv(&(podSpec.InitContainers[i]), ValidatorRuntimeClassEnvName, *podSpec.RuntimeClassName)
                        }
                        // apply mig-strategy env to spin off the plugin-validation workload pod
                        setContainerEnv(&(podSpec.InitContainers[i]), MigStrategyEnvName, string(config.MIG.Strategy))
                        // set/append environment variables for plugin-validation container
                        if len(config.Validator.Plugin.Env) > 0 {
                                for _, env := range config.Validator.Plugin.Env {
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
                                }
                        }
                case "driver":
                        // set/append environment variables for driver-validation container
                        if len(config.Validator.Driver.Env) > 0 {
                                for _, env := range config.Validator.Driver.Env {
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
                                }
                        }
                case "nvidia-fs":
                        if config.GPUDirectStorage == nil || !config.GPUDirectStorage.IsEnabled() {
                                // remove nvidia-fs init container from validator Daemonset if GDS is not enabled
                                podSpec.InitContainers = append(podSpec.InitContainers[:i], podSpec.InitContainers[i+1:]...)
                                return nil
                        }
                case "gdrcopy":
                        if !config.IsGDRCopyEnabled() {
                                // remove gdrcopy init container from validator Daemonset if GDRCopy is not enabled
                                podSpec.InitContainers = append(podSpec.InitContainers[:i], podSpec.InitContainers[i+1:]...)
                                return nil
                        }
                case "cc-manager":
                        if !config.CCManager.IsEnabled() {
                                // remove cc-manager init container from validator Daemonset if it is not enabled
                                podSpec.InitContainers = append(podSpec.InitContainers[:i], podSpec.InitContainers[i+1:]...)
                                return nil
                        }
                case "toolkit":
                        // set/append environment variables for toolkit-validation container
                        if len(config.Validator.Toolkit.Env) > 0 {
                                for _, env := range config.Validator.Toolkit.Env {
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
                                }
                        }
                case "vfio-pci":
                        // set/append environment variables for vfio-pci-validation container
                        setContainerEnv(&(podSpec.InitContainers[i]), "DEFAULT_GPU_WORKLOAD_CONFIG", defaultGPUWorkloadConfig)
                        if len(config.Validator.VFIOPCI.Env) > 0 {
                                for _, env := range config.Validator.VFIOPCI.Env {
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
                                }
                        }
                case "vgpu-manager":
                        // set/append environment variables for vgpu-manager-validation container
                        setContainerEnv(&(podSpec.InitContainers[i]), "DEFAULT_GPU_WORKLOAD_CONFIG", defaultGPUWorkloadConfig)
                        if len(config.Validator.VGPUManager.Env) > 0 {
                                for _, env := range config.Validator.VGPUManager.Env {
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
                                }
                        }
                case "vgpu-devices":
                        // set/append environment variables for vgpu-devices-validation container
                        setContainerEnv(&(podSpec.InitContainers[i]), "DEFAULT_GPU_WORKLOAD_CONFIG", defaultGPUWorkloadConfig)
                        if len(config.Validator.VGPUDevices.Env) > 0 {
                                for _, env := range config.Validator.VGPUDevices.Env {
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
                                }
                        }
                default:
                        return fmt.Errorf("invalid component %q provided to apply validator changes", component)
                }
        }
        return nil
}
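// Editorial note: the removal branches above rely on Go's append-based splice,
// e.g. for a hypothetical slice s and index i:
//
//	s = append(s[:i], s[i+1:]...)
//
// This mutates the backing array, which is why TransformValidatorComponent
// returns immediately after removing an initContainer instead of continuing
// to range over now-stale indices.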

// TransformNodeStatusExporter transforms the node-status-exporter daemonset with required config as per ClusterPolicy
func TransformNodeStatusExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        // update validation container
        err := transformValidationInitContainer(obj, config)
        if err != nil {
                return err
        }

        // update image
        image, err := gpuv1.ImagePath(&config.NodeStatusExporter)
        if err != nil {
                return err
        }
        obj.Spec.Template.Spec.Containers[0].Image = image

        // update image pull policy
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.NodeStatusExporter.ImagePullPolicy)

        // set image pull secrets
        if len(config.NodeStatusExporter.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.NodeStatusExporter.ImagePullSecrets)
        }

        // set resource limits
        if config.NodeStatusExporter.Resources != nil {
                // apply resource limits to all containers
                for i := range obj.Spec.Template.Spec.Containers {
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.NodeStatusExporter.Resources.Requests
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.NodeStatusExporter.Resources.Limits
                }
        }

        // set arguments if specified for node-status-exporter container
        if len(config.NodeStatusExporter.Args) > 0 {
                obj.Spec.Template.Spec.Containers[0].Args = config.NodeStatusExporter.Args
        }

        // set/append environment variables for node-status-exporter container
        if len(config.NodeStatusExporter.Env) > 0 {
                for _, env := range config.NodeStatusExporter.Env {
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
                }
        }

        // update the security context for the node-status-exporter container
        transformValidatorSecurityContext(&obj.Spec.Template.Spec.Containers[0])

        return nil
}

// getRuntimeConfigFiles returns the paths to the top-level and drop-in config files that
// should be used when configuring the specified container runtime.
func getRuntimeConfigFiles(c *corev1.Container, runtime string) (string, string, error) {
        switch runtime {
        case gpuv1.Docker.String():
                topLevelConfigFile := DefaultDockerConfigFile
                if value := getContainerEnv(c, "DOCKER_CONFIG"); value != "" {
                        topLevelConfigFile = value
                } else if value := getContainerEnv(c, "RUNTIME_CONFIG"); value != "" {
                        topLevelConfigFile = value
                }
                // Docker does not support drop-in files.
                return topLevelConfigFile, "", nil
        case gpuv1.Containerd.String():
                topLevelConfigFile := DefaultContainerdConfigFile
                if value := getContainerEnv(c, "CONTAINERD_CONFIG"); value != "" {
                        topLevelConfigFile = value
                } else if value := getContainerEnv(c, "RUNTIME_CONFIG"); value != "" {
                        topLevelConfigFile = value
                }
                dropInConfigFile := DefaultContainerdDropInConfigFile
                if value := getContainerEnv(c, "RUNTIME_DROP_IN_CONFIG"); value != "" {
                        dropInConfigFile = value
                }
                return topLevelConfigFile, dropInConfigFile, nil
        case gpuv1.CRIO.String():
                // TODO: We should still allow the top-level config to be specified
                topLevelConfigFile := DefaultCRIOConfigFile
                if value := getContainerEnv(c, "CRIO_CONFIG"); value != "" {
                        topLevelConfigFile = value
                } else if value := getContainerEnv(c, "RUNTIME_CONFIG"); value != "" {
                        topLevelConfigFile = value
                }
                dropInConfigFile := DefaultCRIODropInConfigFile
                if value := getContainerEnv(c, "RUNTIME_DROP_IN_CONFIG"); value != "" {
                        dropInConfigFile = value
                }
                return topLevelConfigFile, dropInConfigFile, nil
        default:
                return "", "", fmt.Errorf("invalid runtime: %s", runtime)
        }
}
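// Editorial sketch: the env lookups above give a precedence of
// runtime-specific variable > generic RUNTIME_CONFIG > built-in default.
// Illustration with assumed values (not operator code):
func exampleGetRuntimeConfigFiles() {
        ctr := &corev1.Container{
                Env: []corev1.EnvVar{
                        {Name: "RUNTIME_CONFIG", Value: "/custom/generic/config.toml"},
                        {Name: "CONTAINERD_CONFIG", Value: "/custom/containerd/config.toml"},
                },
        }
        top, dropIn, _ := getRuntimeConfigFiles(ctr, gpuv1.Containerd.String())
        fmt.Println(top)    // /custom/containerd/config.toml (CONTAINERD_CONFIG wins)
        fmt.Println(dropIn) // /etc/containerd/conf.d/99-nvidia.toml (default drop-in)
}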

// get runtime(docker, containerd) socket file path based on toolkit container env or default
func getRuntimeSocketFile(c *corev1.Container, runtime string) (string, error) {
        var runtimeSocketFile string
        switch runtime {
        case gpuv1.Docker.String():
                runtimeSocketFile = DefaultDockerSocketFile
                if getContainerEnv(c, "DOCKER_SOCKET") != "" {
                        runtimeSocketFile = getContainerEnv(c, "DOCKER_SOCKET")
                }
        case gpuv1.Containerd.String():
                runtimeSocketFile = DefaultContainerdSocketFile
                if getContainerEnv(c, "CONTAINERD_SOCKET") != "" {
                        runtimeSocketFile = getContainerEnv(c, "CONTAINERD_SOCKET")
                }
        case gpuv1.CRIO.String():
                runtimeSocketFile = ""
        default:
                return "", fmt.Errorf("invalid runtime: %s", runtime)
        }

        return runtimeSocketFile, nil
}

func getContainerEnv(c *corev1.Container, key string) string {
        for _, val := range c.Env {
                if val.Name == key {
                        return val.Value
                }
        }
        return ""
}

func setContainerEnv(c *corev1.Container, key, value string) {
        for i, val := range c.Env {
                if val.Name != key {
                        continue
                }

                c.Env[i].Value = value
                return
        }
        c.Env = append(c.Env, corev1.EnvVar{Name: key, Value: value})
}
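// Editorial sketch: setContainerEnv is an upsert; it overwrites an existing
// variable in place and only appends when the name is absent:
func exampleSetContainerEnv() {
        c := &corev1.Container{Env: []corev1.EnvVar{{Name: "A", Value: "1"}}}
        setContainerEnv(c, "A", "2") // overwrites the existing entry
        setContainerEnv(c, "B", "3") // appends a new entry
        fmt.Printf("%s=%s %s=%s\n", c.Env[0].Name, c.Env[0].Value, c.Env[1].Name, c.Env[1].Value)
        // prints: A=2 B=3
}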

// findContainerByName returns a pointer to the container with the given name, or nil if not found.
func findContainerByName(containers []corev1.Container, name string) *corev1.Container {
        // iterate by index so the returned pointer refers to the slice element
        // itself rather than to a loop-variable copy
        for i := range containers {
                if containers[i].Name == name {
                        return &containers[i]
                }
        }
        return nil
}

func getRuntimeClassName(config *gpuv1.ClusterPolicySpec) string {
        if config.Operator.RuntimeClass != "" {
                return config.Operator.RuntimeClass
        }
        return DefaultRuntimeClass
}

func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec, runtime gpuv1.Runtime) {
        if !config.CDI.IsEnabled() && runtime == gpuv1.CRIO {
                return
        }
        runtimeClassName := getRuntimeClassName(config)
        podSpec.RuntimeClassName = &runtimeClassName
}

func setContainerProbe(container *corev1.Container, probe *gpuv1.ContainerProbeSpec, probeType ContainerProbe) {
        var containerProbe *corev1.Probe

        // determine probe type to update
        switch probeType {
        case Startup:
                containerProbe = container.StartupProbe
        case Liveness:
                containerProbe = container.LivenessProbe
        case Readiness:
                containerProbe = container.ReadinessProbe
        }

        // nothing to override if the container does not define this probe
        if containerProbe == nil {
                return
        }

        // set probe parameters if specified
        if probe.InitialDelaySeconds != 0 {
                containerProbe.InitialDelaySeconds = probe.InitialDelaySeconds
        }
        if probe.TimeoutSeconds != 0 {
                containerProbe.TimeoutSeconds = probe.TimeoutSeconds
        }
        if probe.FailureThreshold != 0 {
                containerProbe.FailureThreshold = probe.FailureThreshold
        }
        if probe.SuccessThreshold != 0 {
                containerProbe.SuccessThreshold = probe.SuccessThreshold
        }
        if probe.PeriodSeconds != 0 {
                containerProbe.PeriodSeconds = probe.PeriodSeconds
        }
}
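// Editorial sketch: only non-zero override fields replace the defaults baked
// into the daemonset manifest; zero-valued fields leave them untouched
// (values below are illustrative):
func exampleSetContainerProbe() {
        ctr := &corev1.Container{
                StartupProbe: &corev1.Probe{FailureThreshold: 60, PeriodSeconds: 10},
        }
        override := &gpuv1.ContainerProbeSpec{FailureThreshold: 120}
        setContainerProbe(ctr, override, Startup)
        fmt.Println(ctr.StartupProbe.FailureThreshold, ctr.StartupProbe.PeriodSeconds)
        // prints: 120 10
}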

// applies MIG related configuration env to container spec
func applyMIGConfiguration(c *corev1.Container, strategy gpuv1.MIGStrategy) {
        // if not set then let plugin decide this per node (default: none)
        if strategy == "" {
                setContainerEnv(c, "NVIDIA_MIG_MONITOR_DEVICES", "all")
                return
        }

        setContainerEnv(c, "MIG_STRATEGY", string(strategy))
        if strategy != gpuv1.MIGStrategyNone {
                setContainerEnv(c, "NVIDIA_MIG_MONITOR_DEVICES", "all")
        }
}
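// Editorial note: spelled out, the branches above yield
//
//	strategy ""       -> NVIDIA_MIG_MONITOR_DEVICES=all (plugin decides per node)
//	strategy "none"   -> MIG_STRATEGY=none
//	strategy "single" -> MIG_STRATEGY=single, NVIDIA_MIG_MONITOR_DEVICES=all
//	strategy "mixed"  -> MIG_STRATEGY=mixed,  NVIDIA_MIG_MONITOR_DEVICES=all
//
// ("single" and "mixed" are the usual non-none strategies; any non-empty,
// non-none value takes the same path.)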

// checks if custom plugin config is provided through a ConfigMap
func isCustomPluginConfigSet(pluginConfig *gpuv1.DevicePluginConfig) bool {
        return pluginConfig != nil && pluginConfig.Name != ""
}

// adds shared volume mounts required for custom plugin config provided via a ConfigMap
func addSharedMountsForPluginConfig(container *corev1.Container, config *gpuv1.DevicePluginConfig) {
        emptyDirMount := corev1.VolumeMount{Name: "config", MountPath: "/config"}
        configVolMount := corev1.VolumeMount{Name: config.Name, MountPath: "/available-configs"}

        container.VolumeMounts = append(container.VolumeMounts, emptyDirMount, configVolMount)
}

// apply spec changes to make custom configurations provided via a ConfigMap available to all containers
func handleDevicePluginConfig(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
        if !isCustomPluginConfigSet(config.DevicePlugin.Config) {
                // remove config-manager-init container
                for i, initContainer := range obj.Spec.Template.Spec.InitContainers {
                        if initContainer.Name != "config-manager-init" {
                                continue
                        }
                        obj.Spec.Template.Spec.InitContainers = append(obj.Spec.Template.Spec.InitContainers[:i], obj.Spec.Template.Spec.InitContainers[i+1:]...)
                }
                // remove config-manager sidecar container
                for i, container := range obj.Spec.Template.Spec.Containers {
                        if container.Name != "config-manager" {
                                continue
                        }
                        obj.Spec.Template.Spec.Containers = append(obj.Spec.Template.Spec.Containers[:i], obj.Spec.Template.Spec.Containers[i+1:]...)
                }
                return nil
        }

        // Apply custom configuration provided through ConfigMap
        // set up env for the main container(s)
        for i, container := range obj.Spec.Template.Spec.Containers {
                switch container.Name {
                case "nvidia-device-plugin":
                case "gpu-feature-discovery":
                case "mps-control-daemon-ctr":
                default:
                        // skip if not a main container
                        continue
                }
                setContainerEnv(&obj.Spec.Template.Spec.Containers[i], "CONFIG_FILE", "/config/config.yaml")
                // set up shared volume (emptyDir) for the main container
                addSharedMountsForPluginConfig(&obj.Spec.Template.Spec.Containers[i], config.DevicePlugin.Config)
        }

        // if hostPID is already set, we skip setting the shareProcessNamespace field;
        // for context, go to https://github.com/kubernetes-client/go/blob/master/kubernetes/docs/V1PodSpec.md
        if !obj.Spec.Template.Spec.HostPID {
                // Enable process ns sharing for PID access
                shareProcessNamespace := true
                obj.Spec.Template.Spec.ShareProcessNamespace = &shareProcessNamespace
        }
        // set up volumes from the ConfigMap and the shared emptyDir
        obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, createConfigMapVolume(config.DevicePlugin.Config.Name, nil))
        obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, createEmptyDirVolume("config"))

        // apply env/volume changes to initContainer
        err := transformConfigManagerInitContainer(obj, config)
        if err != nil {
                return err
        }
        // apply env/volume changes to sidecarContainer
        err = transformConfigManagerSidecarContainer(obj, config)
        if err != nil {
                return err
        }
        return nil
}
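// Editorial sketch: createConfigMapVolume and createEmptyDirVolume are helpers
// defined elsewhere in this file. The wiring they set up pairs a read-only
// ConfigMap of candidate configs (mounted at /available-configs) with a
// writable emptyDir (mounted at /config) where config-manager drops the
// selected file. Assumed shapes for illustration only, not the actual
// implementations:
func sketchEmptyDirVolume(name string) corev1.Volume {
        return corev1.Volume{
                Name:         name,
                VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}},
        }
}

func sketchConfigMapVolume(configMapName string, items []corev1.KeyToPath) corev1.Volume {
        return corev1.Volume{
                Name: configMapName,
                VolumeSource: corev1.VolumeSource{
                        ConfigMap: &corev1.ConfigMapVolumeSource{
                                LocalObjectReference: corev1.LocalObjectReference{Name: configMapName},
                                Items:                items,
                        },
                },
        }
}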

func transformConfigManagerInitContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
        initContainer := findContainerByName(obj.Spec.Template.Spec.InitContainers, "config-manager-init")
        if initContainer == nil {
                // config-manager-init container is not added to the spec, this is a no-op
                return nil
        }
        configManagerImage, err := gpuv1.ImagePath(&config.DevicePlugin)
        if err != nil {
                return err
        }
        initContainer.Image = configManagerImage
        if config.DevicePlugin.ImagePullPolicy != "" {
                initContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy)
        }
        // setup env
        setContainerEnv(initContainer, "DEFAULT_CONFIG", config.DevicePlugin.Config.Default)
        setContainerEnv(initContainer, "FALLBACK_STRATEGIES", "empty")

        // setup volume mounts
        addSharedMountsForPluginConfig(initContainer, config.DevicePlugin.Config)
        return nil
}

func transformConfigManagerSidecarContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
        var container *corev1.Container
        for i := range obj.Spec.Template.Spec.Containers {
                if obj.Spec.Template.Spec.Containers[i].Name != "config-manager" {
                        continue
                }
                container = &obj.Spec.Template.Spec.Containers[i]
        }
        if container == nil {
                // config-manager sidecar container is not added to the spec, this is a no-op
                return nil
        }
        configManagerImage, err := gpuv1.ImagePath(&config.DevicePlugin)
        if err != nil {
                return err
        }
        container.Image = configManagerImage
        if config.DevicePlugin.ImagePullPolicy != "" {
                container.ImagePullPolicy = gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy)
        }
        // setup env
        setContainerEnv(container, "DEFAULT_CONFIG", config.DevicePlugin.Config.Default)
        setContainerEnv(container, "FALLBACK_STRATEGIES", "empty")

        // setup volume mounts
        addSharedMountsForPluginConfig(container, config.DevicePlugin.Config)
        return nil
}

func transformDriverManagerInitContainer(obj *appsv1.DaemonSet, driverManagerSpec *gpuv1.DriverManagerSpec, rdmaSpec *gpuv1.GPUDirectRDMASpec) error {
        container := findContainerByName(obj.Spec.Template.Spec.InitContainers, "k8s-driver-manager")

        if container == nil {
                return fmt.Errorf("failed to find k8s-driver-manager initContainer in spec")
        }

        managerImage, err := gpuv1.ImagePath(driverManagerSpec)
        if err != nil {
                return err
        }
        container.Image = managerImage

        if driverManagerSpec.ImagePullPolicy != "" {
                container.ImagePullPolicy = gpuv1.ImagePullPolicy(driverManagerSpec.ImagePullPolicy)
        }

        if rdmaSpec != nil && rdmaSpec.IsEnabled() {
                setContainerEnv(container, GPUDirectRDMAEnabledEnvName, "true")
                if rdmaSpec.IsHostMOFED() {
                        setContainerEnv(container, UseHostMOFEDEnvName, "true")
                }
        }

        // set/append environment variables for driver-manager initContainer
        if len(driverManagerSpec.Env) > 0 {
                for _, env := range driverManagerSpec.Env {
                        setContainerEnv(container, env.Name, env.Value)
                }
        }

        // add any pull secrets needed for driver-manager image
        if len(driverManagerSpec.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, driverManagerSpec.ImagePullSecrets)
        }

        return nil
}

func transformPeerMemoryContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        for i, container := range obj.Spec.Template.Spec.Containers {
                // skip if not nvidia-peermem
                if !strings.Contains(container.Name, "nvidia-peermem") {
                        continue
                }
                if config.Driver.GPUDirectRDMA == nil || !config.Driver.GPUDirectRDMA.IsEnabled() {
                        // remove nvidia-peermem sidecar container from driver Daemonset if RDMA is not enabled
                        obj.Spec.Template.Spec.Containers = append(obj.Spec.Template.Spec.Containers[:i], obj.Spec.Template.Spec.Containers[i+1:]...)
                        return nil
                }
                // update nvidia-peermem driver image and pull policy to be the same as the
                // gpu-driver image, as it's installed as part of the gpu-driver image
                driverImage, err := resolveDriverTag(n, &config.Driver)
                if err != nil {
                        return err
                }
                if driverImage != "" {
                        obj.Spec.Template.Spec.Containers[i].Image = driverImage
                }
                if config.Driver.ImagePullPolicy != "" {
                        obj.Spec.Template.Spec.Containers[i].ImagePullPolicy = gpuv1.ImagePullPolicy(config.Driver.ImagePullPolicy)
                }
                if config.Driver.GPUDirectRDMA.UseHostMOFED != nil && *config.Driver.GPUDirectRDMA.UseHostMOFED {
                        // set env indicating host-mofed is enabled
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), UseHostMOFEDEnvName, "true")
                }
                // mount any custom kernel module configuration parameters at /drivers
                if config.Driver.KernelModuleConfig != nil && config.Driver.KernelModuleConfig.Name != "" {
                        // note: transformDriverContainer() will have already created a Volume backed by the ConfigMap.
                        // Only add a VolumeMount for nvidia-peermem-ctr.
                        destinationDir := "/drivers"
                        volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.KernelModuleConfig.Name, destinationDir)
                        if err != nil {
                                return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for kernel module configuration: %v", err)
                        }
                        obj.Spec.Template.Spec.Containers[i].VolumeMounts = append(obj.Spec.Template.Spec.Containers[i].VolumeMounts, volumeMounts...)
                }
                if config.Driver.Resources != nil {
                        obj.Spec.Template.Spec.Containers[i].Resources = corev1.ResourceRequirements{
                                Requests: config.Driver.Resources.Requests,
                                Limits:   config.Driver.Resources.Limits,
                        }
                }
        }
        return nil
}

// transformGDSContainer transforms the nvidia-fs (GPUDirect Storage) sidecar container as per
// ClusterPolicy; when running on OpenShift it also wires the container up to the OCP Driver
// Toolkit and sets an env var on the DTK container to signal that GDS is enabled.
func transformGDSContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        for i, container := range obj.Spec.Template.Spec.Containers {
                // skip if not nvidia-fs
                if !strings.Contains(container.Name, "nvidia-fs") {
                        continue
                }
                if config.GPUDirectStorage == nil || !config.GPUDirectStorage.IsEnabled() {
                        n.logger.Info("GPUDirect Storage is disabled")
                        // remove nvidia-fs sidecar container from driver Daemonset if GDS is not enabled
                        obj.Spec.Template.Spec.Containers = append(obj.Spec.Template.Spec.Containers[:i], obj.Spec.Template.Spec.Containers[i+1:]...)
                        return nil
                }
                if config.Driver.UsePrecompiledDrivers() {
                        return fmt.Errorf("GPUDirect Storage driver (nvidia-fs) is not supported along with pre-compiled NVIDIA drivers")
                }

                gdsContainer := &obj.Spec.Template.Spec.Containers[i]

                // update nvidia-fs (sidecar) image and pull policy
                gdsImage, err := resolveDriverTag(n, config.GPUDirectStorage)
                if err != nil {
                        return err
                }
                if gdsImage != "" {
                        gdsContainer.Image = gdsImage
                }
                if config.GPUDirectStorage.ImagePullPolicy != "" {
                        gdsContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.GPUDirectStorage.ImagePullPolicy)
                }

                // set image pull secrets
                if len(config.GPUDirectStorage.ImagePullSecrets) > 0 {
                        addPullSecrets(&obj.Spec.Template.Spec, config.GPUDirectStorage.ImagePullSecrets)
                }

                // set/append environment variables for GDS container
                if len(config.GPUDirectStorage.Env) > 0 {
                        for _, env := range config.GPUDirectStorage.Env {
                                setContainerEnv(gdsContainer, env.Name, env.Value)
                        }
                }

                if config.Driver.RepoConfig != nil && config.Driver.RepoConfig.ConfigMapName != "" {
                        // note: transformDriverContainer() will have already created a Volume backed by the ConfigMap.
                        // Only add a VolumeMount for nvidia-fs-ctr.
                        destinationDir, err := getRepoConfigPath()
                        if err != nil {
                                return fmt.Errorf("ERROR: failed to get destination directory for custom repo config: %w", err)
                        }
                        volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.RepoConfig.ConfigMapName, destinationDir)
                        if err != nil {
                                return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for custom package repo config: %w", err)
                        }
                        gdsContainer.VolumeMounts = append(gdsContainer.VolumeMounts, volumeMounts...)
                }

                // set any custom ssl key/certificate configuration provided
                if config.Driver.CertConfig != nil && config.Driver.CertConfig.Name != "" {
                        destinationDir, err := getCertConfigPath()
                        if err != nil {
                                return fmt.Errorf("ERROR: failed to get destination directory for ssl key/cert config: %w", err)
                        }
                        volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.CertConfig.Name, destinationDir)
                        if err != nil {
                                return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for custom certs: %w", err)
                        }
                        gdsContainer.VolumeMounts = append(gdsContainer.VolumeMounts, volumeMounts...)
                }

                secretName := config.Driver.SecretEnv
                if len(secretName) > 0 {
                        err := createSecretEnvReference(n.ctx, n.client, secretName, n.operatorNamespace, gdsContainer)
                        if err != nil {
                                return fmt.Errorf("ERROR: failed to attach secret %s to the driver container: %w", secretName, err)
                        }
                }

                // transform the nvidia-fs-ctr to use the openshift driver toolkit and
                // notify the openshift driver toolkit container that GDS is enabled
                err = transformOpenShiftDriverToolkitContainer(obj, config, n, "nvidia-fs-ctr")
                if err != nil {
                        return fmt.Errorf("ERROR: failed to transform the Driver Toolkit Container: %w", err)
                }
                if config.Driver.Resources != nil {
                        gdsContainer.Resources = corev1.ResourceRequirements{
                                Requests: config.Driver.Resources.Requests,
                                Limits:   config.Driver.Resources.Limits,
                        }
                }
        }
        return nil
}

func transformGDRCopyContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        for i, container := range obj.Spec.Template.Spec.Containers {
                // skip if not nvidia-gdrcopy
                if !strings.HasPrefix(container.Name, "nvidia-gdrcopy") {
                        continue
                }
                if config.GDRCopy == nil || !config.GDRCopy.IsEnabled() {
                        n.logger.Info("GDRCopy is disabled")
                        // remove nvidia-gdrcopy sidecar container from driver Daemonset if gdrcopy is not enabled
                        obj.Spec.Template.Spec.Containers = append(obj.Spec.Template.Spec.Containers[:i], obj.Spec.Template.Spec.Containers[i+1:]...)
                        return nil
                }
                if config.Driver.UsePrecompiledDrivers() {
                        return fmt.Errorf("GDRCopy is not supported along with pre-compiled NVIDIA drivers")
                }

                gdrcopyContainer := &obj.Spec.Template.Spec.Containers[i]

                // update nvidia-gdrcopy image and pull policy
                gdrcopyImage, err := resolveDriverTag(n, config.GDRCopy)
                if err != nil {
                        return err
                }
                if gdrcopyImage != "" {
                        gdrcopyContainer.Image = gdrcopyImage
                }
                if config.GDRCopy.ImagePullPolicy != "" {
                        gdrcopyContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.GDRCopy.ImagePullPolicy)
                }

                // set image pull secrets
                if len(config.GDRCopy.ImagePullSecrets) > 0 {
                        addPullSecrets(&obj.Spec.Template.Spec, config.GDRCopy.ImagePullSecrets)
                }

                // set/append environment variables for gdrcopy container
                if len(config.GDRCopy.Env) > 0 {
                        for _, env := range config.GDRCopy.Env {
                                setContainerEnv(gdrcopyContainer, env.Name, env.Value)
                        }
                }

                if config.Driver.RepoConfig != nil && config.Driver.RepoConfig.ConfigMapName != "" {
                        // note: transformDriverContainer() will have already created a Volume backed by the ConfigMap.
                        // Only add a VolumeMount for nvidia-gdrcopy-ctr.
                        destinationDir, err := getRepoConfigPath()
                        if err != nil {
                                return fmt.Errorf("ERROR: failed to get destination directory for custom repo config: %w", err)
                        }
                        volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.RepoConfig.ConfigMapName, destinationDir)
                        if err != nil {
                                return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for custom package repo config: %w", err)
                        }
                        gdrcopyContainer.VolumeMounts = append(gdrcopyContainer.VolumeMounts, volumeMounts...)
                }

                // set any custom ssl key/certificate configuration provided
                if config.Driver.CertConfig != nil && config.Driver.CertConfig.Name != "" {
                        destinationDir, err := getCertConfigPath()
                        if err != nil {
                                return fmt.Errorf("ERROR: failed to get destination directory for ssl key/cert config: %w", err)
                        }
                        volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.CertConfig.Name, destinationDir)
                        if err != nil {
                                return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for custom certs: %w", err)
                        }
                        gdrcopyContainer.VolumeMounts = append(gdrcopyContainer.VolumeMounts, volumeMounts...)
                }

                secretName := config.Driver.SecretEnv
                if len(secretName) > 0 {
                        err := createSecretEnvReference(n.ctx, n.client, secretName, n.operatorNamespace, gdrcopyContainer)
                        if err != nil {
                                return fmt.Errorf("ERROR: failed to attach secret %s to the driver container: %w", secretName, err)
                        }
                }

                // transform the nvidia-gdrcopy-ctr to use the openshift driver toolkit and
                // notify the openshift driver toolkit container that gdrcopy is enabled
                err = transformOpenShiftDriverToolkitContainer(obj, config, n, "nvidia-gdrcopy-ctr")
                if err != nil {
                        return fmt.Errorf("ERROR: failed to transform the Driver Toolkit Container: %w", err)
                }
                if config.Driver.Resources != nil {
                        gdrcopyContainer.Resources = corev1.ResourceRequirements{
                                Requests: config.Driver.Resources.Requests,
                                Limits:   config.Driver.Resources.Limits,
                        }
                }
        }
        return nil
}

// getSanitizedKernelVersion returns kernelVersion with the following changes:
// 1. Remove the arch suffix (as we use multi-arch images), and
// 2. Ensure it meets the k8s constraints for metadata.name, i.e. it must
// consist of lower-case alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character
func getSanitizedKernelVersion(kernelVersion string) string {
        archRegex := regexp.MustCompile("x86_64(?:_64k)?|aarch64(?:_64k)?")
        // remove arch strings, "_" and any trailing "." from the kernel version
        sanitizedVersion := strings.TrimSuffix(strings.ReplaceAll(archRegex.ReplaceAllString(kernelVersion, ""), "_", "."), ".")
        return strings.ToLower(sanitizedVersion)
}
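// Editorial sketch: worked examples, traced through the code above:
func exampleGetSanitizedKernelVersion() {
        fmt.Println(getSanitizedKernelVersion("5.14.0-284.25.1.el9_2.x86_64"))
        fmt.Println(getSanitizedKernelVersion("5.14.0-284.11.1.el9_2.aarch64_64k"))
        // prints:
        // 5.14.0-284.25.1.el9.2
        // 5.14.0-284.11.1.el9.2
}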

func transformPrecompiledDriverDaemonset(obj *appsv1.DaemonSet, n ClusterPolicyController) (err error) {
        sanitizedVersion := getSanitizedKernelVersion(n.currentKernelVersion)
        // prepare the DaemonSet to be kernel-version specific
        obj.Name += "-" + sanitizedVersion + "-" + n.kernelVersionMap[n.currentKernelVersion]

        // add unique labels for each kernel-version specific Daemonset
        obj.Labels[precompiledIdentificationLabelKey] = precompiledIdentificationLabelValue
        obj.Spec.Template.Labels[precompiledIdentificationLabelKey] = precompiledIdentificationLabelValue

        // append kernel-version specific node-selector
        obj.Spec.Template.Spec.NodeSelector[nfdKernelLabelKey] = n.currentKernelVersion
        return nil
}

func transformOpenShiftDriverToolkitContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController, operandContainerName string) error {
        var err error

        getContainer := func(name string, remove bool) (*corev1.Container, error) {
                for i, container := range obj.Spec.Template.Spec.Containers {
                        if container.Name != name {
                                continue
                        }
                        if !remove {
                                return &obj.Spec.Template.Spec.Containers[i], nil
                        }

                        obj.Spec.Template.Spec.Containers = append(obj.Spec.Template.Spec.Containers[:i],
                                obj.Spec.Template.Spec.Containers[i+1:]...)
                        return nil, nil
                }

                // if the container is not found, it must have been removed already; return success
                if remove {
                        return nil, nil
                }

                return nil, fmt.Errorf("could not find the '%s' container", name)
        }

        if !n.ocpDriverToolkit.enabled {
                if n.ocpDriverToolkit.requested {
                        n.logger.Info("OpenShift DriverToolkit was requested but could not be enabled (dependencies missing)")
                }

                /* remove the OpenShift Driver Toolkit side-car container from the driver DaemonSet */
                _, err = getContainer("openshift-driver-toolkit-ctr", true)
                return err
        }

        /* find the main container and the driver-toolkit sidecar container */
        var operandMainContainer, driverToolkitContainer *corev1.Container
        if operandMainContainer, err = getContainer(operandContainerName, false); err != nil {
                return err
        }

        if driverToolkitContainer, err = getContainer("openshift-driver-toolkit-ctr", false); err != nil {
                return err
        }

        /* prepare the DaemonSet to be RHCOS-version specific */
        rhcosVersion := n.ocpDriverToolkit.currentRhcosVersion

        if !strings.Contains(obj.Name, rhcosVersion) {
                obj.Name += "-" + rhcosVersion
        }
        obj.Labels["app"] = obj.Name
        obj.Spec.Selector.MatchLabels["app"] = obj.Name
        obj.Spec.Template.Labels["app"] = obj.Name

        obj.Labels[ocpDriverToolkitVersionLabel] = rhcosVersion
        obj.Spec.Template.Spec.NodeSelector[nfdOSTreeVersionLabelKey] = rhcosVersion

        /* prepare the DaemonSet to be searchable */
        obj.Labels[ocpDriverToolkitIdentificationLabel] = ocpDriverToolkitIdentificationValue
        obj.Spec.Template.Labels[ocpDriverToolkitIdentificationLabel] = ocpDriverToolkitIdentificationValue

        /* prepare the DriverToolkit container */
        setContainerEnv(driverToolkitContainer, "RHCOS_VERSION", rhcosVersion)

        if config.GPUDirectStorage != nil && config.GPUDirectStorage.IsEnabled() {
                setContainerEnv(driverToolkitContainer, "GDS_ENABLED", "true")
                n.logger.V(2).Info("transformOpenShiftDriverToolkitContainer", "GDS_ENABLED", config.GPUDirectStorage.IsEnabled())
        }

        if config.GDRCopy != nil && config.GDRCopy.IsEnabled() {
                setContainerEnv(driverToolkitContainer, "GDRCOPY_ENABLED", "true")
                n.logger.V(2).Info("transformOpenShiftDriverToolkitContainer", "GDRCOPY_ENABLED", "true")
        }

        image := n.ocpDriverToolkit.rhcosDriverToolkitImages[n.ocpDriverToolkit.currentRhcosVersion]
        if image != "" {
                driverToolkitContainer.Image = image
                n.logger.Info("DriverToolkit", "image", driverToolkitContainer.Image)
        } else {
                /* RHCOS tag missing in the Driver-Toolkit imagestream, set up the fallback */
                obj.Labels["openshift.driver-toolkit.rhcos-image-missing"] = "true"
                obj.Spec.Template.Labels["openshift.driver-toolkit.rhcos-image-missing"] = "true"

                driverToolkitContainer.Image = operandMainContainer.Image
                setContainerEnv(operandMainContainer, "RHCOS_IMAGE_MISSING", "true")
                setContainerEnv(operandMainContainer, "RHCOS_VERSION", rhcosVersion)
                setContainerEnv(driverToolkitContainer, "RHCOS_IMAGE_MISSING", "true")

                n.logger.Info("WARNING: DriverToolkit image tag missing. Version-specific fallback mode enabled.", "rhcosVersion", rhcosVersion)
        }

        /* prepare the main container to start from the DriverToolkit entrypoint */
        switch operandContainerName {
        case "nvidia-fs-ctr":
                operandMainContainer.Command = []string{"ocp_dtk_entrypoint"}
                operandMainContainer.Args = []string{"nv-fs-ctr-run-with-dtk"}
        case "nvidia-gdrcopy-ctr":
                operandMainContainer.Command = []string{"ocp_dtk_entrypoint"}
                operandMainContainer.Args = []string{"gdrcopy-ctr-run-with-dtk"}
        default:
                operandMainContainer.Command = []string{"ocp_dtk_entrypoint"}
                operandMainContainer.Args = []string{"nv-ctr-run-with-dtk"}
        }

        /* prepare the shared volumes */
        // shared directory
        volSharedDirName, volSharedDirPath := "shared-nvidia-driver-toolkit", "/mnt/shared-nvidia-driver-toolkit"

        volMountSharedDir := corev1.VolumeMount{Name: volSharedDirName, MountPath: volSharedDirPath}
        operandMainContainer.VolumeMounts = append(operandMainContainer.VolumeMounts, volMountSharedDir)

        volSharedDir := corev1.Volume{
                Name: volSharedDirName,
                VolumeSource: corev1.VolumeSource{
                        EmptyDir: &corev1.EmptyDirVolumeSource{},
                },
        }

        // check if the volume already exists; if not, add it
        for i := range obj.Spec.Template.Spec.Volumes {
                if obj.Spec.Template.Spec.Volumes[i].Name == volSharedDirName {
                        // already exists, avoid a duplicated volume
                        return nil
                }
        }
        obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, volSharedDir)

        // set resource limits
        if config.Driver.Resources != nil {
                driverToolkitContainer.Resources = corev1.ResourceRequirements{
                        Requests: config.Driver.Resources.Requests,
                        Limits:   config.Driver.Resources.Limits,
                }
        }
        return nil
}

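// Illustrative sketch (not part of the upstream file): the duplicate-volume
// guard above suggests this transform is applied once per operand container
// of the same driver DaemonSet, e.g.
//
//     transformOpenShiftDriverToolkitContainer(obj, config, n, "nvidia-driver-ctr")
//     transformOpenShiftDriverToolkitContainer(obj, config, n, "nvidia-fs-ctr")
//
// Each call adds a VolumeMount for /mnt/shared-nvidia-driver-toolkit to its
// operand container, while the backing emptyDir volume is appended only once.
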
// resolveDriverTag resolves the image tag based on the OS of the worker node
func resolveDriverTag(n ClusterPolicyController, driverSpec interface{}) (string, error) {
        // obtain the os version
        kvers, osTag, _ := kernelFullVersion(n)
        if kvers == "" {
                return "", fmt.Errorf("ERROR: Could not find kernel full version: ('%s', '%s')", kvers, osTag)
        }

        // obtain the image path
        var image string
        var err error
        switch spec := driverSpec.(type) {
        case *gpuv1.DriverSpec:
                // check if this is a pre-compiled driver deployment.
                if spec.UsePrecompiledDrivers() {
                        if spec.Repository == "" && spec.Version == "" {
                                if spec.Image != "" {
                                        // this is useful for tools like kbld (carvel) which specify the driver.image param as path:version
                                        image = spec.Image + "-" + n.currentKernelVersion
                                } else {
                                        return "", fmt.Errorf("unable to resolve driver image path for pre-compiled drivers: driver.repository, driver.image and driver.version have to be specified in the ClusterPolicy")
                                }
                        } else {
                                // use a per kernel version tag
                                image = spec.Repository + "/" + spec.Image + ":" + spec.Version + "-" + n.currentKernelVersion
                        }
                } else {
                        image, err = gpuv1.ImagePath(spec)
                        if err != nil {
                                return "", err
                        }
                }
        case *gpuv1.GPUDirectStorageSpec:
                image, err = gpuv1.ImagePath(spec)
                if err != nil {
                        return "", err
                }
        case *gpuv1.VGPUManagerSpec:
                image, err = gpuv1.ImagePath(spec)
                if err != nil {
                        return "", err
                }
        case *gpuv1.GDRCopySpec:
                image, err = gpuv1.ImagePath(spec)
                if err != nil {
                        return "", err
                }
        default:
                return "", fmt.Errorf("invalid type to construct image path: %v", spec)
        }

        // if an image digest is specified, use it directly; otherwise append the os-tag
        if !strings.Contains(image, "sha256:") {
                // append the os-tag to the provided driver version
                image = fmt.Sprintf("%s-%s", image, osTag)
        }
        return image, nil
}

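// Illustrative sketch (not part of the upstream file): a typical resolution
// for the standard (non-precompiled) driver path, with repository, image,
// version and osTag values assumed for the example:
//
//     repository=nvcr.io/nvidia, image=driver, version=550.54.15, osTag=ubuntu22.04
//     => "nvcr.io/nvidia/driver:550.54.15-ubuntu22.04"
//
// When the resolved image already pins a digest ("...@sha256:..."), the
// os-tag suffix is skipped and the digest is used as-is.
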
// getRepoConfigPath returns the standard OS-specific path for repository configuration files
func getRepoConfigPath() (string, error) {
        release, err := parseOSRelease()
        if err != nil {
                return "", err
        }

        os := release["ID"]
        if path, ok := RepoConfigPathMap[os]; ok {
                return path, nil
        }
        return "", fmt.Errorf("distribution not supported")
}

// getCertConfigPath returns the standard OS-specific path for ssl keys/certificates
func getCertConfigPath() (string, error) {
        release, err := parseOSRelease()
        if err != nil {
                return "", err
        }

        os := release["ID"]
        if path, ok := CertConfigPathMap[os]; ok {
                return path, nil
        }
        return "", fmt.Errorf("distribution not supported")
}

// getSubscriptionPathsToVolumeSources returns the MountPathToVolumeSource map containing all
// OS-specific subscription/entitlement paths that need to be mounted in the container.
func getSubscriptionPathsToVolumeSources() (MountPathToVolumeSource, error) {
        release, err := parseOSRelease()
        if err != nil {
                return nil, err
        }

        os := release["ID"]
        if pathToVolumeSource, ok := SubscriptionPathMap[os]; ok {
                return pathToVolumeSource, nil
        }
        return nil, fmt.Errorf("distribution not supported")
}

// createConfigMapVolumeMounts creates a VolumeMount for each key
// in the ConfigMap. Use subPath to ensure original contents
// at destinationDir are not overwritten.
func createConfigMapVolumeMounts(n ClusterPolicyController, configMapName string, destinationDir string) ([]corev1.VolumeMount, []corev1.KeyToPath, error) {
        ctx := n.ctx
        // get the ConfigMap
        cm := &corev1.ConfigMap{}
        opts := client.ObjectKey{Namespace: n.operatorNamespace, Name: configMapName}
        err := n.client.Get(ctx, opts, cm)
        if err != nil {
                return nil, nil, fmt.Errorf("ERROR: could not get ConfigMap %s from client: %v", configMapName, err)
        }

        // create one volume mount per file in the ConfigMap and use subPath
        var filenames []string
        for filename := range cm.Data {
                filenames = append(filenames, filename)
        }
        // sort so volume mounts are added to the spec in a deterministic order
        sort.Strings(filenames)
        var itemsToInclude []corev1.KeyToPath
        var volumeMounts []corev1.VolumeMount
        for _, filename := range filenames {
                volumeMounts = append(volumeMounts,
                        corev1.VolumeMount{Name: configMapName, ReadOnly: true, MountPath: filepath.Join(destinationDir, filename), SubPath: filename})
                itemsToInclude = append(itemsToInclude, corev1.KeyToPath{
                        Key:  filename,
                        Path: filename,
                })
        }
        return volumeMounts, itemsToInclude, nil
}

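// Illustrative sketch (not part of the upstream file): a ConfigMap with keys
// "cuda.repo" and "extra.repo" mounted with destinationDir "/etc/yum.repos.d"
// yields one subPath mount per key:
//
//     /etc/yum.repos.d/cuda.repo   (subPath: cuda.repo)
//     /etc/yum.repos.d/extra.repo  (subPath: extra.repo)
//
// Files already present in the destination directory remain visible, which a
// whole-directory ConfigMap mount would hide.
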
func createConfigMapVolume(configMapName string, itemsToInclude []corev1.KeyToPath) corev1.Volume {
        volumeSource := corev1.VolumeSource{
                ConfigMap: &corev1.ConfigMapVolumeSource{
                        LocalObjectReference: corev1.LocalObjectReference{
                                Name: configMapName,
                        },
                        Items: itemsToInclude,
                },
        }
        return corev1.Volume{Name: configMapName, VolumeSource: volumeSource}
}

func createEmptyDirVolume(volumeName string) corev1.Volume {
        return corev1.Volume{
                Name: volumeName,
                VolumeSource: corev1.VolumeSource{
                        EmptyDir: &corev1.EmptyDirVolumeSource{},
                },
        }
}

func applyLicensingConfig(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, driverContainer *corev1.Container) {
        podSpec := &obj.Spec.Template.Spec

        // add a new volume mount
        licensingConfigVolMount := corev1.VolumeMount{Name: "licensing-config", ReadOnly: true, MountPath: consts.VGPULicensingConfigMountPath, SubPath: consts.VGPULicensingFileName}
        driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, licensingConfigVolMount)

        // gridd.conf is always mounted
        licenseItemsToInclude := []corev1.KeyToPath{
                {
                        Key:  consts.VGPULicensingFileName,
                        Path: consts.VGPULicensingFileName,
                },
        }
        // the client config token is only mounted when NLS is enabled
        if config.Driver.LicensingConfig.IsNLSEnabled() {
                licenseItemsToInclude = append(licenseItemsToInclude, corev1.KeyToPath{
                        Key:  consts.NLSClientTokenFileName,
                        Path: consts.NLSClientTokenFileName,
                })
                nlsTokenVolMount := corev1.VolumeMount{Name: "licensing-config", ReadOnly: true, MountPath: consts.NLSClientTokenMountPath, SubPath: consts.NLSClientTokenFileName}
                driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, nlsTokenVolMount)
        }

        var licensingConfigVolumeSource corev1.VolumeSource
        if config.Driver.LicensingConfig.SecretName != "" {
                licensingConfigVolumeSource = corev1.VolumeSource{
                        Secret: &corev1.SecretVolumeSource{
                                SecretName: config.Driver.LicensingConfig.SecretName,
                                Items:      licenseItemsToInclude,
                        },
                }
        } else if config.Driver.LicensingConfig.ConfigMapName != "" {
                licensingConfigVolumeSource = corev1.VolumeSource{
                        ConfigMap: &corev1.ConfigMapVolumeSource{
                                LocalObjectReference: corev1.LocalObjectReference{
                                        Name: config.Driver.LicensingConfig.ConfigMapName,
                                },
                                Items: licenseItemsToInclude,
                        },
                }
        }
        licensingConfigVol := corev1.Volume{Name: "licensing-config", VolumeSource: licensingConfigVolumeSource}
        podSpec.Volumes = append(podSpec.Volumes, licensingConfigVol)
}

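// Illustrative sketch (not part of the upstream file): with NLS enabled, the
// licensing volume is typically fed from a Secret or ConfigMap holding both
// items (the literal file names are assumed here to match
// consts.VGPULicensingFileName and consts.NLSClientTokenFileName):
//
//     kubectl create configmap licensing-config -n gpu-operator \
//         --from-file=gridd.conf --from-file=client_configuration_token.tok
//
// Only the gridd.conf item is projected when NLS is disabled.
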
func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        podSpec := &obj.Spec.Template.Spec
        driverContainer := findContainerByName(podSpec.Containers, "nvidia-driver-ctr")
        if driverContainer == nil {
                return fmt.Errorf("driver container (nvidia-driver-ctr) is missing from the driver daemonset manifest")
        }

        image, err := resolveDriverTag(n, &config.Driver)
        if err != nil {
                return err
        }
        if image != "" {
                driverContainer.Image = image
        }

        // update image pull policy
        driverContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.Driver.ImagePullPolicy)

        // set image pull secrets
        if len(config.Driver.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.Driver.ImagePullSecrets)
        }
        // set resource limits
        if config.Driver.Resources != nil {
                driverContainer.Resources.Requests = config.Driver.Resources.Requests
                driverContainer.Resources.Limits = config.Driver.Resources.Limits
        }
        // set arguments if specified for the driver container
        if len(config.Driver.Args) > 0 {
                driverContainer.Args = config.Driver.Args
        }

        if len(config.Driver.KernelModuleType) > 0 {
                setContainerEnv(driverContainer, KernelModuleTypeEnvName, config.Driver.KernelModuleType)
                // we set the "OPEN_KERNEL_MODULES_ENABLED" env var for backwards compatibility with older driver containers
                if config.Driver.OpenKernelModulesEnabled() {
                        setContainerEnv(driverContainer, OpenKernelModulesEnabledEnvName, "true")
                }
        }

        if len(config.Driver.DriverType) > 0 {
                setContainerEnv(driverContainer, DriverTypeEnvName, config.Driver.DriverType)
        }

        // set container probe timeouts
        if config.Driver.StartupProbe != nil {
                setContainerProbe(driverContainer, config.Driver.StartupProbe, Startup)
        }
        if config.Driver.LivenessProbe != nil {
                setContainerProbe(driverContainer, config.Driver.LivenessProbe, Liveness)
        }
        if config.Driver.ReadinessProbe != nil {
                setContainerProbe(driverContainer, config.Driver.ReadinessProbe, Readiness)
        }

        if config.Driver.GPUDirectRDMA != nil && config.Driver.GPUDirectRDMA.IsEnabled() {
                // set env indicating nvidia-peermem is enabled to compile the module with the required ib_* interfaces
                setContainerEnv(driverContainer, GPUDirectRDMAEnabledEnvName, "true")
                // check if MOFED drivers are directly installed on the host and update the source path accordingly
                // to build the nvidia-peermem module
                if config.Driver.GPUDirectRDMA.UseHostMOFED != nil && *config.Driver.GPUDirectRDMA.UseHostMOFED {
                        // mount the /usr/src/ofa_kernel path directly from the host to build using MOFED drivers installed on the host
                        for index, volume := range podSpec.Volumes {
                                if volume.Name == "mlnx-ofed-usr-src" {
                                        podSpec.Volumes[index].HostPath.Path = "/usr/src"
                                }
                        }
                        // set env indicating host-mofed is enabled
                        setContainerEnv(driverContainer, UseHostMOFEDEnvName, "true")
                }
        }

        // set any licensing configuration required
        if config.Driver.IsVGPULicensingEnabled() {
                applyLicensingConfig(obj, config, driverContainer)
        }

        // set the virtual topology daemon configuration if specified for the vGPU driver
        if config.Driver.VirtualTopology != nil && config.Driver.VirtualTopology.Config != "" {
                topologyConfigVolMount := corev1.VolumeMount{Name: "topology-config", ReadOnly: true, MountPath: consts.VGPUTopologyConfigMountPath, SubPath: consts.VGPUTopologyConfigFileName}
                driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, topologyConfigVolMount)

                topologyConfigVolumeSource := corev1.VolumeSource{
                        ConfigMap: &corev1.ConfigMapVolumeSource{
                                LocalObjectReference: corev1.LocalObjectReference{
                                        Name: config.Driver.VirtualTopology.Config,
                                },
                                Items: []corev1.KeyToPath{
                                        {
                                                Key:  consts.VGPUTopologyConfigFileName,
                                                Path: consts.VGPUTopologyConfigFileName,
                                        },
                                },
                        },
                }
                topologyConfigVol := corev1.Volume{Name: "topology-config", VolumeSource: topologyConfigVolumeSource}
                podSpec.Volumes = append(podSpec.Volumes, topologyConfigVol)
        }

        // mount any custom kernel module configuration parameters at /drivers
        if config.Driver.KernelModuleConfig != nil && config.Driver.KernelModuleConfig.Name != "" {
                destinationDir := "/drivers"
                volumeMounts, itemsToInclude, err := createConfigMapVolumeMounts(n, config.Driver.KernelModuleConfig.Name, destinationDir)
                if err != nil {
                        return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for kernel module configuration: %v", err)
                }
                driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, volumeMounts...)
                podSpec.Volumes = append(podSpec.Volumes, createConfigMapVolume(config.Driver.KernelModuleConfig.Name, itemsToInclude))
        }

        if len(config.Driver.Env) > 0 {
                for _, env := range config.Driver.Env {
                        // set env on the driver container located by name above
                        setContainerEnv(driverContainer, env.Name, env.Value)
                }
        }

        // no further repo configuration is required when using pre-compiled drivers, return here.
        if config.Driver.UsePrecompiledDrivers() {
                return nil
        }

        // set any custom repo configuration provided when using runfile based driver installation
        if config.Driver.RepoConfig != nil && config.Driver.RepoConfig.ConfigMapName != "" {
                destinationDir, err := getRepoConfigPath()
                if err != nil {
                        return fmt.Errorf("ERROR: failed to get destination directory for custom repo config: %v", err)
                }
                volumeMounts, itemsToInclude, err := createConfigMapVolumeMounts(n, config.Driver.RepoConfig.ConfigMapName, destinationDir)
                if err != nil {
                        return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for custom repo config: %v", err)
                }
                driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, volumeMounts...)
                podSpec.Volumes = append(podSpec.Volumes, createConfigMapVolume(config.Driver.RepoConfig.ConfigMapName, itemsToInclude))
        }

        // set any custom ssl key/certificate configuration provided
        if config.Driver.CertConfig != nil && config.Driver.CertConfig.Name != "" {
                destinationDir, err := getCertConfigPath()
                if err != nil {
                        return fmt.Errorf("ERROR: failed to get destination directory for custom certs: %v", err)
                }
                volumeMounts, itemsToInclude, err := createConfigMapVolumeMounts(n, config.Driver.CertConfig.Name, destinationDir)
                if err != nil {
                        return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for custom certs: %w", err)
                }
                driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, volumeMounts...)
                podSpec.Volumes = append(podSpec.Volumes, createConfigMapVolume(config.Driver.CertConfig.Name, itemsToInclude))
        }

        secretName := config.Driver.SecretEnv
        if len(secretName) > 0 {
                err := createSecretEnvReference(n.ctx, n.client, secretName, n.operatorNamespace, driverContainer)
                if err != nil {
                        return fmt.Errorf("ERROR: failed to attach secret %s to the driver container: %w", secretName, err)
                }
        }

        release, err := parseOSRelease()
        if err != nil {
                return fmt.Errorf("ERROR: failed to get os-release: %s", err)
        }

        // set up subscription entitlements for RHEL (when using K8s with a non-CRIO runtime), SLES and SL Micro
        if (release["ID"] == "rhel" && n.openshift == "" && n.runtime != gpuv1.CRIO) || release["ID"] == "sles" || release["ID"] == "sl-micro" {
                n.logger.Info("Mounting subscriptions into the driver container", "OS", release["ID"])
                pathToVolumeSource, err := getSubscriptionPathsToVolumeSources()
                if err != nil {
                        return fmt.Errorf("ERROR: failed to get path items for subscription entitlements: %v", err)
                }

                // sort the host path volumes to ensure ordering is preserved when adding to the pod spec
                mountPaths := make([]string, 0, len(pathToVolumeSource))
                for k := range pathToVolumeSource {
                        mountPaths = append(mountPaths, k)
                }
                sort.Strings(mountPaths)

                for num, mountPath := range mountPaths {
                        volMountSubscriptionName := fmt.Sprintf("subscription-config-%d", num)

                        volMountSubscription := corev1.VolumeMount{
                                Name:      volMountSubscriptionName,
                                MountPath: mountPath,
                                ReadOnly:  true,
                        }
                        driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, volMountSubscription)

                        subscriptionVol := corev1.Volume{Name: volMountSubscriptionName, VolumeSource: pathToVolumeSource[mountPath]}
                        podSpec.Volumes = append(podSpec.Volumes, subscriptionVol)
                }
        }

        // apply proxy and env settings if this is an OpenShift cluster
        if _, ok := release["OPENSHIFT_VERSION"]; ok {
                setContainerEnv(driverContainer, "OPENSHIFT_VERSION", release["OPENSHIFT_VERSION"])

                // Automatically apply proxy settings for OCP and inject a custom CA if configured by the user
                // https://docs.openshift.com/container-platform/4.6/networking/configuring-a-custom-pki.html
                err = applyOCPProxySpec(n, podSpec)
                if err != nil {
                        return err
                }
        }
        return nil
}

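// Illustrative sketch (not part of the upstream file): supplying custom repo
// configuration for runfile-based installs. The ConfigMap keys become files
// under the OS-specific path returned by getRepoConfigPath() (names assumed):
//
//     kubectl create configmap repo-config -n gpu-operator \
//         --from-file=custom.repo
//
// referenced from the ClusterPolicy via driver.repoConfig.configMapName.
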
func createSecretEnvReference(ctx context.Context, ctrlClient client.Client, secretName string,
        namespace string, container *corev1.Container) error {
        envFrom := container.EnvFrom
        if len(envFrom) == 0 {
                envFrom = make([]corev1.EnvFromSource, 0)
        }

        // get the Secret
        sec := &corev1.Secret{}
        opts := client.ObjectKey{Namespace: namespace, Name: secretName}
        err := ctrlClient.Get(ctx, opts, sec)
        if err != nil {
                return fmt.Errorf("ERROR: could not get Secret %s from client: %w", secretName, err)
        }

        secretEnvSource := corev1.EnvFromSource{
                SecretRef: &corev1.SecretEnvSource{
                        LocalObjectReference: corev1.LocalObjectReference{
                                Name: sec.Name,
                        },
                }}
        envFrom = append(envFrom, secretEnvSource)
        container.EnvFrom = envFrom
        return nil
}

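// Illustrative sketch (not part of the upstream file): after attaching a
// Secret named "driver-secret-env" (name assumed), the container spec gains
//
//     envFrom:
//     - secretRef:
//         name: driver-secret-env
//
// so every key in the Secret is exposed to the driver container as an
// environment variable.
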
func transformVGPUManagerContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
        container := findContainerByName(obj.Spec.Template.Spec.Containers, "nvidia-vgpu-manager-ctr")

        if container == nil {
                return fmt.Errorf("failed to find nvidia-vgpu-manager-ctr in spec")
        }

        image, err := resolveDriverTag(n, &config.VGPUManager)
        if err != nil {
                return err
        }
        if image != "" {
                container.Image = image
        }

        // update image pull policy
        container.ImagePullPolicy = gpuv1.ImagePullPolicy(config.VGPUManager.ImagePullPolicy)

        // set image pull secrets
        if len(config.VGPUManager.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.VGPUManager.ImagePullSecrets)
        }
        // set resource limits
        if config.VGPUManager.Resources != nil {
                container.Resources.Requests = config.VGPUManager.Resources.Requests
                container.Resources.Limits = config.VGPUManager.Resources.Limits
        }
        // set arguments if specified for the vGPU Manager container
        if len(config.VGPUManager.Args) > 0 {
                container.Args = config.VGPUManager.Args
        }

        release, err := parseOSRelease()
        if err != nil {
                return fmt.Errorf("ERROR: failed to get os-release: %s", err)
        }

        // add env for OCP
        if _, ok := release["OPENSHIFT_VERSION"]; ok {
                setContainerEnv(container, "OPENSHIFT_VERSION", release["OPENSHIFT_VERSION"])
        }

        if len(config.VGPUManager.Env) > 0 {
                for _, env := range config.VGPUManager.Env {
                        // set env on the vGPU Manager container located by name above
                        setContainerEnv(container, env.Name, env.Value)
                }
        }

        return nil
}

func applyUpdateStrategyConfig(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
        switch config.Daemonsets.UpdateStrategy {
        case "OnDelete":
                obj.Spec.UpdateStrategy = appsv1.DaemonSetUpdateStrategy{Type: appsv1.OnDeleteDaemonSetStrategyType}
        case "RollingUpdate":
                fallthrough
        default:
                // update the config for the RollingUpdate strategy
                if config.Daemonsets.RollingUpdate == nil || config.Daemonsets.RollingUpdate.MaxUnavailable == "" {
                        return nil
                }
                if strings.HasPrefix(obj.Name, commonDriverDaemonsetName) {
                        // disallow setting the RollingUpdate strategy for the driver daemonset
                        return nil
                }
                var intOrString intstr.IntOrString
                if strings.HasSuffix(config.Daemonsets.RollingUpdate.MaxUnavailable, "%") {
                        intOrString = intstr.IntOrString{Type: intstr.String, StrVal: config.Daemonsets.RollingUpdate.MaxUnavailable}
                } else {
                        int64Val, err := strconv.ParseInt(config.Daemonsets.RollingUpdate.MaxUnavailable, 10, 32)
                        if err != nil {
                                return fmt.Errorf("failed to apply rolling update config: %s", err)
                        }
                        intOrString = intstr.IntOrString{Type: intstr.Int, IntVal: int32(int64Val)}
                }
                rollingUpdateSpec := appsv1.RollingUpdateDaemonSet{MaxUnavailable: &intOrString}
                obj.Spec.UpdateStrategy = appsv1.DaemonSetUpdateStrategy{Type: appsv1.RollingUpdateDaemonSetStrategyType, RollingUpdate: &rollingUpdateSpec}
        }
        return nil
}

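// Illustrative sketch (not part of the upstream file): MaxUnavailable is
// accepted either as a percentage or as an absolute count,
//
//     daemonsets:
//       updateStrategy: RollingUpdate
//       rollingUpdate:
//         maxUnavailable: "25%"  # kept as a string IntOrString
//         # maxUnavailable: "2"  # parsed into an integer IntOrString
//
// and is never applied to the driver daemonset, for which the function
// returns early.
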
func transformValidationInitContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
        for i, initContainer := range obj.Spec.Template.Spec.InitContainers {
                // skip if not a validation initContainer
                if !strings.Contains(initContainer.Name, "validation") {
                        continue
                }

                // TODO: refactor the component-specific validation logic so that we are not duplicating TransformValidatorComponent()
                // pass env for the driver-validation init container
                if strings.HasPrefix(initContainer.Name, "driver") {
                        if len(config.Validator.Driver.Env) > 0 {
                                for _, env := range config.Validator.Driver.Env {
                                        setContainerEnv(&(obj.Spec.Template.Spec.InitContainers[i]), env.Name, env.Value)
                                }
                        }
                }

                // pass env for the toolkit-validation init container
                if strings.HasPrefix(initContainer.Name, "toolkit") {
                        if len(config.Validator.Toolkit.Env) > 0 {
                                for _, env := range config.Validator.Toolkit.Env {
                                        setContainerEnv(&(obj.Spec.Template.Spec.InitContainers[i]), env.Name, env.Value)
                                }
                        }
                }

                // update the validation image
                image, err := gpuv1.ImagePath(&config.Validator)
                if err != nil {
                        return err
                }
                obj.Spec.Template.Spec.InitContainers[i].Image = image
                // update the validation image pull policy
                if config.Validator.ImagePullPolicy != "" {
                        obj.Spec.Template.Spec.InitContainers[i].ImagePullPolicy = gpuv1.ImagePullPolicy(config.Validator.ImagePullPolicy)
                }
                // update the security context for the validator container
                transformValidatorSecurityContext(&obj.Spec.Template.Spec.InitContainers[i])
        }
        // add any pull secrets needed for the validation image
        if len(config.Validator.ImagePullSecrets) > 0 {
                addPullSecrets(&obj.Spec.Template.Spec, config.Validator.ImagePullSecrets)
        }
        return nil
}

func addPullSecrets(podSpec *corev1.PodSpec, secrets []string) {
        for _, secret := range secrets {
                if !containsSecret(podSpec.ImagePullSecrets, secret) {
                        podSpec.ImagePullSecrets = append(podSpec.ImagePullSecrets, corev1.LocalObjectReference{Name: secret})
                }
        }
}

func containsSecret(secrets []corev1.LocalObjectReference, secretName string) bool {
        for _, s := range secrets {
                if s.Name == secretName {
                        return true
                }
        }
        return false
}

func isDeploymentReady(name string, n ClusterPolicyController) gpuv1.State {
        opts := []client.ListOption{
                client.MatchingLabels{"app": name},
        }
        n.logger.V(1).Info("Deployment", "LabelSelector", fmt.Sprintf("app=%s", name))
        list := &appsv1.DeploymentList{}
        err := n.client.List(n.ctx, list, opts...)
        if err != nil {
                n.logger.Info("Could not get DeploymentList", "Error", err)
        }
        n.logger.V(1).Info("Deployment", "NumberOfDeployment", len(list.Items))
        if len(list.Items) == 0 {
                return gpuv1.NotReady
        }

        ds := list.Items[0]
        n.logger.V(1).Info("Deployment", "NumberUnavailable", ds.Status.UnavailableReplicas)

        if ds.Status.UnavailableReplicas != 0 {
                return gpuv1.NotReady
        }

        return isPodReady(name, n, "Running")
}

func isDaemonSetReady(name string, n ClusterPolicyController) gpuv1.State {
        ctx := n.ctx
        ds := &appsv1.DaemonSet{}
        n.logger.V(2).Info("checking daemonset for readiness", "name", name)
        err := n.client.Get(ctx, types.NamespacedName{Namespace: n.operatorNamespace, Name: name}, ds)
        if err != nil {
                n.logger.Error(err, "could not get daemonset", "name", name)
        }

        if ds.Status.DesiredNumberScheduled == 0 {
                n.logger.V(2).Info("DaemonSet has a desired pod count of 0", "name", name)
                return gpuv1.Ready
        }

        if ds.Status.NumberUnavailable != 0 {
                n.logger.Info("daemonset not ready", "name", name)
                return gpuv1.NotReady
        }

        // only when the ds is running with the "OnDelete" strategy do we need to check that the revision matches for all pods
        if ds.Spec.UpdateStrategy.Type != appsv1.OnDeleteDaemonSetStrategyType {
                return gpuv1.Ready
        }

        opts := []client.ListOption{client.MatchingLabels(ds.Spec.Template.Labels)}

        n.logger.V(2).Info("Pod", "LabelSelector", fmt.Sprintf("app=%s", name))
        list := &corev1.PodList{}
        err = n.client.List(ctx, list, opts...)
        if err != nil {
                n.logger.Info("Could not get PodList", "Error", err)
                return gpuv1.NotReady
        }
        n.logger.V(2).Info("Pod", "NumberOfPods", len(list.Items))
        if len(list.Items) == 0 {
                return gpuv1.NotReady
        }

        dsPods := getPodsOwnedbyDaemonset(ds, list.Items, n)
        daemonsetRevisionHash, err := getDaemonsetControllerRevisionHash(ctx, ds, n)
        if err != nil {
                n.logger.Error(
                        err, "Failed to get daemonset template revision hash", "daemonset", ds)
                return gpuv1.NotReady
        }
        n.logger.V(2).Info("daemonset template revision hash", "hash", daemonsetRevisionHash)

        for _, pod := range dsPods {
                pod := pod
                podRevisionHash, err := getPodControllerRevisionHash(ctx, &pod)
                if err != nil {
                        n.logger.Error(
                                err, "Failed to get pod template revision hash", "pod", pod)
                        return gpuv1.NotReady
                }
                n.logger.V(2).Info("pod template revision hash", "hash", podRevisionHash)

                // check that the revision hashes match and the pod is in the running state
                if podRevisionHash != daemonsetRevisionHash || pod.Status.Phase != "Running" {
                        return gpuv1.NotReady
                }

                // the pod revision matches the daemonset revision and the pod is running;
                // now verify that every container in it is ready
                if len(pod.Status.ContainerStatuses) != 0 {
                        for i := range pod.Status.ContainerStatuses {
                                if !pod.Status.ContainerStatuses[i].Ready {
                                        // return NotReady if at least one container isn't ready
                                        return gpuv1.NotReady
                                }
                        }
                }
        }

        // all pods are on the latest revision and all containers are ready
        return gpuv1.Ready
}

func getPodsOwnedbyDaemonset(ds *appsv1.DaemonSet, pods []corev1.Pod, n ClusterPolicyController) []corev1.Pod {
        dsPodList := []corev1.Pod{}
        for _, pod := range pods {
                if len(pod.OwnerReferences) < 1 {
                        n.logger.Info("Driver Pod has no owner DaemonSet", "pod", pod.Name)
                        continue
                }
                n.logger.V(2).Info("Pod", "pod", pod.Name, "owner", pod.OwnerReferences[0].Name)

                if ds.UID != pod.OwnerReferences[0].UID {
                        n.logger.Info("Driver Pod is not owned by this Driver DaemonSet",
                                "pod", pod, "actual owner", pod.OwnerReferences[0])
                        continue
                }
                dsPodList = append(dsPodList, pod)
        }
        return dsPodList
}

func getPodControllerRevisionHash(ctx context.Context, pod *corev1.Pod) (string, error) {
        if hash, ok := pod.Labels[PodControllerRevisionHashLabelKey]; ok {
                return hash, nil
        }
        return "", fmt.Errorf("controller-revision-hash label not present for pod %s", pod.Name)
}

func getDaemonsetControllerRevisionHash(ctx context.Context, daemonset *appsv1.DaemonSet, n ClusterPolicyController) (string, error) {
        // get all revisions for the daemonset
        opts := []client.ListOption{
                client.MatchingLabels(daemonset.Spec.Selector.MatchLabels),
                client.InNamespace(n.operatorNamespace),
        }
        list := &appsv1.ControllerRevisionList{}
        err := n.client.List(ctx, list, opts...)
        if err != nil {
                return "", fmt.Errorf("error getting controller revision list for daemonset %s: %v", daemonset.Name, err)
        }

        n.logger.V(2).Info("obtained controller revisions", "Daemonset", daemonset.Name, "len", len(list.Items))

        var revisions []appsv1.ControllerRevision
        for _, controllerRevision := range list.Items {
                if strings.HasPrefix(controllerRevision.Name, daemonset.Name) {
                        revisions = append(revisions, controllerRevision)
                }
        }

        if len(revisions) == 0 {
                return "", fmt.Errorf("no revision found for daemonset %s", daemonset.Name)
        }

        // sort the revision list to make sure we always obtain the latest revision
        sort.Slice(revisions, func(i, j int) bool { return revisions[i].Revision < revisions[j].Revision })

        currentRevision := revisions[len(revisions)-1]
        hash := strings.TrimPrefix(currentRevision.Name, fmt.Sprintf("%s-", daemonset.Name))

        return hash, nil
}

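// Illustrative sketch (not part of the upstream file): ControllerRevision
// names have the form "<daemonset-name>-<hash>", so for a daemonset named
// "nvidia-driver-daemonset" a latest revision
//
//     nvidia-driver-daemonset-5f6d8c9b7
//
// yields the hash "5f6d8c9b7", which is then compared against each pod's
// controller-revision-hash label by isDaemonSetReady.
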
// Deployment creates a Deployment resource
func Deployment(n ClusterPolicyController) (gpuv1.State, error) {
        ctx := n.ctx
        state := n.idx
        obj := n.resources[state].Deployment.DeepCopy()
        obj.Namespace = n.operatorNamespace

        logger := n.logger.WithValues("Deployment", obj.Name, "Namespace", obj.Namespace)

        // check if the state is disabled and clean up the resource if it exists
        if !n.isStateEnabled(n.stateNames[n.idx]) {
                err := n.client.Delete(ctx, obj)
                if err != nil && !apierrors.IsNotFound(err) {
                        logger.Info("Couldn't delete", "Error", err)
                        return gpuv1.NotReady, err
                }
                return gpuv1.Disabled, nil
        }

        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
                return gpuv1.NotReady, err
        }

        if err := n.client.Create(ctx, obj); err != nil {
                if apierrors.IsAlreadyExists(err) {
                        logger.Info("Found Resource, updating...")
                        err = n.client.Update(ctx, obj)
                        if err != nil {
                                logger.Info("Couldn't update", "Error", err)
                                return gpuv1.NotReady, err
                        }
                        return isDeploymentReady(obj.Name, n), nil
                }

                logger.Info("Couldn't create", "Error", err)
                return gpuv1.NotReady, err
        }

        return isDeploymentReady(obj.Name, n), nil
}

func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) {
        ctx := n.ctx
        found := &apiimagev1.ImageStream{}
        name := "driver-toolkit"
        namespace := consts.OpenshiftNamespace
        err := n.client.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, found)
        if err != nil {
                if apierrors.IsNotFound(err) {
                        n.logger.Info("ocpHasDriverToolkitImageStream: driver-toolkit imagestream not found",
                                "Name", name,
                                "Namespace", namespace)

                        return false, nil
                }

                n.logger.Info("Couldn't get the driver-toolkit imagestream", "Error", err)

                return false, err
        }
        n.logger.V(1).Info("ocpHasDriverToolkitImageStream: driver-toolkit imagestream found")
        isBroken := false
        for _, tag := range found.Spec.Tags {
                if tag.Name == "" {
                        isBroken = true
                        continue
                }
                if tag.Name == "latest" || tag.From == nil {
                        continue
                }
                n.logger.V(1).Info("ocpHasDriverToolkitImageStream: tag", "name", tag.Name, "image", tag.From.Name)
                n.ocpDriverToolkit.rhcosDriverToolkitImages[tag.Name] = tag.From.Name
        }
        if isBroken {
                n.logger.Info("WARNING: ocpHasDriverToolkitImageStream: driver-toolkit imagestream is broken, see RHBZ#2015024")

                n.operatorMetrics.openshiftDriverToolkitIsBroken.Set(1)
        } else {
                n.operatorMetrics.openshiftDriverToolkitIsBroken.Set(0)
        }

        return true, nil
}

func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) error {
        // Get all DaemonSets owned by ClusterPolicy
        //
        // (cdesiniotis) There is a limitation with the controller-runtime client where only a single field selector
        // is allowed when specifying ListOptions or DeleteOptions.
        // See GH issue: https://github.com/kubernetes-sigs/controller-runtime/issues/612
        list := &appsv1.DaemonSetList{}
        err := n.client.List(ctx, list, client.MatchingFields{clusterPolicyControllerIndexKey: n.singleton.Name})
        if err != nil {
                return fmt.Errorf("failed to list all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err)
        }

        for _, ds := range list.Items {
                ds := ds
                // filter out DaemonSets which are not the NVIDIA driver/vgpu-manager
                if strings.HasPrefix(ds.Name, commonDriverDaemonsetName) || strings.HasPrefix(ds.Name, commonVGPUManagerDaemonsetName) {
                        n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name)
                        err = n.client.Delete(ctx, &ds)
                        if err != nil {
                                return fmt.Errorf("error deleting NVIDIA driver daemonset: %w", err)
                        }
                }
        }

        return nil
}

// cleanupStalePrecompiledDaemonsets deletes stale precompiled driver daemonsets, which can occur when
// 1. all nodes have upgraded to the latest kernel, or
// 2. no GPU nodes are present
func (n ClusterPolicyController) cleanupStalePrecompiledDaemonsets(ctx context.Context) error {
        opts := []client.ListOption{
                client.MatchingLabels{
                        precompiledIdentificationLabelKey: precompiledIdentificationLabelValue,
                },
        }
        list := &appsv1.DaemonSetList{}
        err := n.client.List(ctx, list, opts...)
        if err != nil {
                n.logger.Error(err, "could not get daemonset list")
                return err
        }

        for idx := range list.Items {
                ds := list.Items[idx]
                name := ds.Name
                desiredNumberScheduled := ds.Status.DesiredNumberScheduled
                numberMisscheduled := ds.Status.NumberMisscheduled

                n.logger.V(1).Info("Driver DaemonSet found",
                        "Name", name,
                        "Status.DesiredNumberScheduled", desiredNumberScheduled)

                // We consider a daemonset to be stale only if it has no desired number of pods and no pods currently mis-scheduled.
                // As per the Kubernetes docs, a daemonset pod is mis-scheduled when an already scheduled pod no longer satisfies
                // node affinity constraints or has un-tolerated taints, e.g. "node.kubernetes.io/unreachable:NoSchedule"
                if desiredNumberScheduled == 0 && numberMisscheduled == 0 {
                        n.logger.Info("Delete Driver DaemonSet", "Name", name)

                        err = n.client.Delete(ctx, &ds)
                        if err != nil {
                                n.logger.Error(err, "Could not delete DaemonSet",
                                        "Name", name)
                        }
                } else {
                        n.logger.Info("Driver DaemonSet active, keep it.",
                                "Name", name,
                                "Status.DesiredNumberScheduled", desiredNumberScheduled)
                }
        }
        return nil
}

// precompiledDriverDaemonsets goes through all the kernel versions
// found in the cluster, sets `currentKernelVersion` and calls the
// original DaemonSet() function to create/update the kernel-specific
// DaemonSet.
func precompiledDriverDaemonsets(ctx context.Context, n ClusterPolicyController) (gpuv1.State, []error) {
        overallState := gpuv1.Ready
        var errs []error
        n.logger.Info("cleaning any stale precompiled driver daemonsets")
        err := n.cleanupStalePrecompiledDaemonsets(ctx)
        if err != nil {
                return gpuv1.NotReady, append(errs, err)
        }

        n.logger.V(1).Info("preparing pre-compiled driver daemonsets")
        for kernelVersion, os := range n.kernelVersionMap {
                // set the current kernel version
                n.currentKernelVersion = kernelVersion

                n.logger.Info("preparing pre-compiled driver daemonset",
                        "version", n.currentKernelVersion, "os", os)

                state, err := DaemonSet(n)
                if state != gpuv1.Ready {
                        n.logger.Info("pre-compiled driver daemonset not ready",
                                "version", n.currentKernelVersion, "state", state)
                        overallState = state
                }
                if err != nil {
                        errs = append(errs, fmt.Errorf("failed to handle Precompiled Driver Daemonset for version %s: %v", kernelVersion, err))
                }
        }

        // reset the current kernel version
        n.currentKernelVersion = ""
        return overallState, errs
}

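// Illustrative sketch (not part of the upstream file): n.kernelVersionMap is
// keyed by kernel version with the OS tag as value, e.g. (values assumed)
//
//     {"5.14.0-284.25.1.el9_2.x86_64": "rhel9.2"}
//
// and DaemonSet(n) runs once per entry with currentKernelVersion set,
// producing one precompiled-driver DaemonSet per kernel in the cluster.
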
// ocpDriverToolkitDaemonSets goes through all the RHCOS versions
// found in the cluster, sets `currentRhcosVersion` and calls the
// original DaemonSet() function to create/update the RHCOS-specific
// DaemonSet.
func (n ClusterPolicyController) ocpDriverToolkitDaemonSets(ctx context.Context) (gpuv1.State, error) {
	err := n.ocpCleanupStaleDriverToolkitDaemonSets(ctx)
	if err != nil {
		return gpuv1.NotReady, err
	}

	n.logger.V(1).Info("preparing DriverToolkit DaemonSet",
		"rhcos", n.ocpDriverToolkit.rhcosVersions)

	overallState := gpuv1.Ready
	var errs error

	for rhcosVersion := range n.ocpDriverToolkit.rhcosVersions {
		n.ocpDriverToolkit.currentRhcosVersion = rhcosVersion

		n.logger.V(1).Info("preparing DriverToolkit DaemonSet",
			"rhcosVersion", n.ocpDriverToolkit.currentRhcosVersion)

		state, err := DaemonSet(n)

		n.logger.V(1).Info("preparing DriverToolkit DaemonSet",
			"rhcosVersion", n.ocpDriverToolkit.currentRhcosVersion, "state", state)
		if state != gpuv1.Ready {
			overallState = state
		}

		if err != nil {
			// aggregate per-version errors instead of overwriting earlier ones
			errs = errors.Join(errs,
				fmt.Errorf("failed to handle OpenShift Driver Toolkit Daemonset for version %s: %v", rhcosVersion, err))
		}
	}

	n.ocpDriverToolkit.currentRhcosVersion = ""

	tagsMissing := false
	for rhcosVersion, image := range n.ocpDriverToolkit.rhcosDriverToolkitImages {
		if image != "" {
			continue
		}
		n.logger.Info("WARNING: RHCOS driver-toolkit image missing. Version-specific fallback mode enabled.", "rhcosVersion", rhcosVersion)
		tagsMissing = true
	}
	if tagsMissing {
		n.operatorMetrics.openshiftDriverToolkitRhcosTagsMissing.Set(1)
	} else {
		n.operatorMetrics.openshiftDriverToolkitRhcosTagsMissing.Set(0)
	}

	return overallState, errs
}

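// A minimal sketch of the errors.Join aggregation above, with hypothetical
// versions and errors errA/errB:
//
//	var errs error
//	errs = errors.Join(errs, fmt.Errorf("failed to handle OpenShift Driver Toolkit Daemonset for version %s: %v", "412.86", errA))
//	errs = errors.Join(errs, fmt.Errorf("failed to handle OpenShift Driver Toolkit Daemonset for version %s: %v", "413.92", errB))
//	// errs.Error() yields both messages, newline-separated; a nil first
//	// argument is skipped, so the first Join simply returns the new error.
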
// ocpCleanupStaleDriverToolkitDaemonSets scans the DriverToolkit
// RHCOS-version-specific DaemonSets and deletes the unused ones:
// - the RHCOS version wasn't found in the node labels (upgrade finished)
// - the RHCOS version was marked for deletion earlier in the reconciliation loop (currently unexpected)
// - the DaemonSet has no RHCOS version label (unexpected)
// The DaemonSet is kept if:
// - the RHCOS version was found in the node labels (most likely case)
func (n ClusterPolicyController) ocpCleanupStaleDriverToolkitDaemonSets(ctx context.Context) error {
	opts := []client.ListOption{
		client.MatchingLabels{
			ocpDriverToolkitIdentificationLabel: ocpDriverToolkitIdentificationValue,
		},
	}

	list := &appsv1.DaemonSetList{}
	err := n.client.List(ctx, list, opts...)
	if err != nil {
		n.logger.Info("ERROR: Could not get DaemonSetList", "Error", err)
		return err
	}

	for idx := range list.Items {
		name := list.Items[idx].Name
		dsRhcosVersion, versionOk := list.Items[idx].Labels[ocpDriverToolkitVersionLabel]
		clusterHasRhcosVersion, clusterOk := n.ocpDriverToolkit.rhcosVersions[dsRhcosVersion]
		desiredNumberScheduled := list.Items[idx].Status.DesiredNumberScheduled

		n.logger.V(1).Info("Driver DaemonSet found",
			"Name", name,
			"dsRhcosVersion", dsRhcosVersion,
			"clusterHasRhcosVersion", clusterHasRhcosVersion,
			"desiredNumberScheduled", desiredNumberScheduled)

		if desiredNumberScheduled != 0 {
			n.logger.Info("Driver DaemonSet active, keep it.",
				"Name", name, "Status.DesiredNumberScheduled", desiredNumberScheduled)
			continue
		}

		if !versionOk {
			n.logger.Info("WARNING: Driver DaemonSet doesn't have DriverToolkit version label",
				"Name", name, "Label", ocpDriverToolkitVersionLabel,
			)
		} else {
			switch {
			case !clusterOk:
				n.logger.V(1).Info("Driver DaemonSet RHCOS version NOT part of the cluster",
					"Name", name, "RHCOS version", dsRhcosVersion,
				)
			case clusterHasRhcosVersion:
				n.logger.V(1).Info("Driver DaemonSet RHCOS version is part of the cluster, keep it.",
					"Name", name, "RHCOS version", dsRhcosVersion,
				)

				// the version of RHCOS targeted by this DS is part of the cluster,
				// keep it alive

				continue
			default: /* clusterHasRhcosVersion == false */
				// currently unexpected
				n.logger.V(1).Info("Driver DaemonSet RHCOS version marked for deletion",
					"Name", name, "RHCOS version", dsRhcosVersion,
				)
			}
		}

		n.logger.Info("Delete Driver DaemonSet", "Name", name)
		err = n.client.Delete(ctx, &list.Items[idx])
		if err != nil {
			n.logger.Info("ERROR: Could not delete DaemonSet",
				"Name", name, "Error", err)
			return err
		}
	}
	return nil
}

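// Decision summary for the stale check above, applied to an inactive
// DaemonSet (desiredNumberScheduled == 0):
//
//	version label | rhcosVersions map entry | action
//	--------------+-------------------------+--------------------------------
//	missing       | n/a                     | delete (unexpected)
//	present       | absent                  | delete (upgrade finished)
//	present       | true                    | keep (version still in cluster)
//	present       | false                   | delete (marked earlier)
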
// cleanupUnusedVGPUManagerDaemonsets cleans up the vgpu-manager DaemonSet(s)
// according to whether operator.useOCPDriverToolkit is enabled on OpenShift.
// This allows toggling the flag after the initial deployment. If no
// error happens, returns the number of Pods belonging to these
// DaemonSets.
func (n ClusterPolicyController) cleanupUnusedVGPUManagerDaemonsets(ctx context.Context) (int, error) {
	podCount := 0
	if n.openshift == "" {
		return podCount, nil
	}

	if !n.ocpDriverToolkit.enabled {
		// cleanup DTK daemonsets
		count, err := n.cleanupDriverDaemonsets(ctx,
			ocpDriverToolkitIdentificationLabel,
			ocpDriverToolkitIdentificationValue, commonVGPUManagerDaemonsetName)
		if err != nil {
			return 0, err
		}
		podCount = count
	} else {
		// cleanup legacy vgpu-manager daemonsets
		count, err := n.cleanupDriverDaemonsets(ctx,
			appLabelKey,
			commonVGPUManagerDaemonsetName, commonVGPUManagerDaemonsetName)
		if err != nil {
			return 0, err
		}
		podCount = count
	}
	return podCount, nil
}

// cleanupUnusedDriverDaemonSets cleans up the driver DaemonSet(s)
// according to the following:
// 1. whether driver.usePrecompiled is enabled
// 2. whether operator.useOCPDriverToolkit is enabled on OpenShift
// This allows toggling either flag after the initial deployment. If no
// error happens, returns the number of Pods belonging to these
// DaemonSets.
func (n ClusterPolicyController) cleanupUnusedDriverDaemonSets(ctx context.Context) (int, error) {
	podCount := 0
	if n.openshift != "" {
		switch {
		case n.singleton.Spec.Driver.UsePrecompiledDrivers():
			// cleanup DTK daemonsets
			count, err := n.cleanupDriverDaemonsets(ctx,
				ocpDriverToolkitIdentificationLabel,
				ocpDriverToolkitIdentificationValue, commonDriverDaemonsetName)
			if err != nil {
				return 0, err
			}
			podCount = count
			// cleanup legacy driver daemonsets that use the run file
			count, err = n.cleanupDriverDaemonsets(ctx,
				precompiledIdentificationLabelKey,
				"false", commonDriverDaemonsetName)
			if err != nil {
				return 0, err
			}
			podCount += count

		case n.ocpDriverToolkit.enabled:
			// cleanup pre-compiled and legacy driver daemonsets
			count, err := n.cleanupDriverDaemonsets(ctx,
				appLabelKey,
				commonDriverDaemonsetName, commonDriverDaemonsetName)
			if err != nil {
				return 0, err
			}
			podCount = count
		default:
			// cleanup pre-compiled driver daemonsets
			count, err := n.cleanupDriverDaemonsets(ctx,
				precompiledIdentificationLabelKey,
				precompiledIdentificationLabelValue, commonDriverDaemonsetName)
			if err != nil {
				return 0, err
			}
			podCount = count

			// cleanup DTK daemonsets
			count, err = n.cleanupDriverDaemonsets(ctx,
				ocpDriverToolkitIdentificationLabel,
				ocpDriverToolkitIdentificationValue, commonDriverDaemonsetName)
			if err != nil {
				return 0, err
			}
			podCount += count
		}
	} else {
		if n.singleton.Spec.Driver.UsePrecompiledDrivers() {
			// cleanup legacy driver daemonsets that use the run file
			count, err := n.cleanupDriverDaemonsets(ctx,
				precompiledIdentificationLabelKey,
				"false", commonDriverDaemonsetName)
			if err != nil {
				return 0, err
			}
			podCount = count
		} else {
			// cleanup pre-compiled driver daemonsets
			count, err := n.cleanupDriverDaemonsets(ctx,
				precompiledIdentificationLabelKey,
				precompiledIdentificationLabelValue, commonDriverDaemonsetName)
			if err != nil {
				return 0, err
			}
			podCount = count
		}
	}
	return podCount, nil
}

// cleanupDriverDaemonsets deletes the DaemonSets matching a given key/value
// label pair and name prefix. If no error happens, returns the number of
// Pods still belonging to the matching DaemonSets.
func (n ClusterPolicyController) cleanupDriverDaemonsets(ctx context.Context, searchKey string, searchValue string, namePrefix string) (int, error) {
	var opts = []client.ListOption{client.MatchingLabels{searchKey: searchValue}}

	dsList := &appsv1.DaemonSetList{}
	if err := n.client.List(ctx, dsList, opts...); err != nil {
		n.logger.Error(err, "Could not get DaemonSetList")
		return 0, err
	}

	var lastErr error
	for idx := range dsList.Items {
		// ignore daemonsets that don't match the required name prefix
		if !strings.HasPrefix(dsList.Items[idx].Name, namePrefix) {
			continue
		}
		n.logger.Info("Delete DaemonSet",
			"Name", dsList.Items[idx].Name,
		)
		if err := n.client.Delete(ctx, &dsList.Items[idx]); err != nil {
			n.logger.Error(err, "Could not delete DaemonSet",
				"Name", dsList.Items[idx].Name)
			lastErr = err
		}
	}

	// return the last error that occurred, if any
	if lastErr != nil {
		return 0, lastErr
	}

	podList := &corev1.PodList{}
	if err := n.client.List(ctx, podList, opts...); err != nil {
		n.logger.Info("ERROR: Could not get PodList", "Error", err)
		return 0, err
	}

	podCount := 0
	for idx := range podList.Items {
		// ignore pods that don't match the required name prefix
		if !strings.HasPrefix(podList.Items[idx].Name, namePrefix) {
			continue
		}
		podCount++
	}
	return podCount, nil
}

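// A minimal usage sketch, mirroring the precompiled-driver branch above
// (the label constants are the ones used elsewhere in this file):
//
//	podCount, err := n.cleanupDriverDaemonsets(ctx,
//		precompiledIdentificationLabelKey, precompiledIdentificationLabelValue,
//		commonDriverDaemonsetName)
//	if err == nil && podCount > 0 {
//		// pods of the deleted DaemonSets are still terminating; report
//		// NotReady so the reconciler retries later.
//	}
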
// DaemonSet creates a DaemonSet resource
func DaemonSet(n ClusterPolicyController) (gpuv1.State, error) {
	ctx := n.ctx
	state := n.idx
	obj := n.resources[state].DaemonSet.DeepCopy()
	obj.Namespace = n.operatorNamespace

	logger := n.logger.WithValues("DaemonSet", obj.Name, "Namespace", obj.Namespace)

	// Check if state is disabled and clean up the resource if it exists
	if !n.isStateEnabled(n.stateNames[n.idx]) {
		err := n.client.Delete(ctx, obj)
		if err != nil && !apierrors.IsNotFound(err) {
			logger.Info("Couldn't delete", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Disabled, nil
	}

	if !n.hasGPUNodes {
		// multiple DaemonSets (eg, driver, dcgm-exporter) cannot be
		// deployed without knowing the OS name, so skip their
		// deployment for now. The operator will be notified
		// (addWatchNewGPUNode) when new nodes join the cluster.
		logger.Info("No GPU node in the cluster, do not create DaemonSets")
		return gpuv1.Ready, nil
	}

	if n.resources[state].DaemonSet.GetName() == commonDriverDaemonsetName {
		podCount, err := n.cleanupUnusedDriverDaemonSets(n.ctx)
		if err != nil {
			return gpuv1.NotReady, err
		}
		if podCount != 0 {
			logger.Info("Driver DaemonSet cleanup in progress", "podCount", podCount)
			return gpuv1.NotReady, nil
		}

		// DaemonSets using pre-compiled packages or the driver-toolkit (OpenShift)
		// require the creation of one DaemonSet per kernel version (or RHCOS
		// version). If currentKernelVersion or currentRhcosVersion (OpenShift) is
		// not set, we intercept here and call DaemonSet() once per specific version.
		if n.singleton.Spec.Driver.UsePrecompiledDrivers() {
			if n.currentKernelVersion == "" {
				overallState, errs := precompiledDriverDaemonsets(ctx, n)
				if len(errs) != 0 {
					return overallState, fmt.Errorf("unable to deploy precompiled driver daemonsets: %v", errs)
				}
				return overallState, nil
			}
		} else if n.openshift != "" && n.ocpDriverToolkit.enabled &&
			n.ocpDriverToolkit.currentRhcosVersion == "" {
			return n.ocpDriverToolkitDaemonSets(ctx)
		}
	} else if n.resources[state].DaemonSet.Name == commonVGPUManagerDaemonsetName {
		podCount, err := n.cleanupUnusedVGPUManagerDaemonsets(ctx)
		if err != nil {
			return gpuv1.NotReady, err
		}
		if podCount != 0 {
			logger.Info("Driver DaemonSet cleanup in progress", "podCount", podCount)
			return gpuv1.NotReady, nil
		}
		if n.openshift != "" && n.ocpDriverToolkit.enabled &&
			n.ocpDriverToolkit.currentRhcosVersion == "" {
			// OpenShift Driver Toolkit requires the creation of
			// one Driver DaemonSet per RHCOS version (stored in
			// n.ocpDriverToolkit.rhcosVersions).
			//
			// Here, we are at the top-most call of DaemonSet(),
			// as currentRhcosVersion is unset.
			//
			// Initiate the multi-DaemonSet OCP DriverToolkit
			// deployment.
			return n.ocpDriverToolkitDaemonSets(ctx)
		}
	}

	err := preProcessDaemonSet(obj, n)
	if err != nil {
		logger.Info("Could not pre-process", "Error", err)
		return gpuv1.NotReady, err
	}

	if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
		logger.Info("SetControllerReference failed", "Error", err)
		return gpuv1.NotReady, err
	}

	if obj.Labels == nil {
		obj.Labels = make(map[string]string)
	}

	for labelKey, labelValue := range n.singleton.Spec.Daemonsets.Labels {
		obj.Labels[labelKey] = labelValue
	}

	// Daemonsets will always have at least one annotation applied, so allocate if necessary
	if obj.Annotations == nil {
		obj.Annotations = make(map[string]string)
	}

	for annoKey, annoValue := range n.singleton.Spec.Daemonsets.Annotations {
		obj.Annotations[annoKey] = annoValue
	}

	found := &appsv1.DaemonSet{}
	err = n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
	if err != nil && apierrors.IsNotFound(err) {
		logger.Info("DaemonSet not found, creating",
			"Name", obj.Name,
		)
		// generate hash for the spec to create
		hashStr := utils.GetObjectHash(obj)
		// add annotation to the DaemonSet with the hash value during creation
		obj.Annotations[NvidiaAnnotationHashKey] = hashStr
		err = n.client.Create(ctx, obj)
		if err != nil {
			logger.Info("Couldn't create DaemonSet",
				"Name", obj.Name,
				"Error", err,
			)
			return gpuv1.NotReady, err
		}
		return isDaemonSetReady(obj.Name, n), nil
	} else if err != nil {
		logger.Info("Failed to get DaemonSet from client",
			"Name", obj.Name,
			"Error", err.Error())
		return gpuv1.NotReady, err
	}

	changed := isDaemonsetSpecChanged(found, obj)
	if changed {
		logger.Info("DaemonSet is different, updating", "name", obj.Name)
		err = n.client.Update(ctx, obj)
		if err != nil {
			return gpuv1.NotReady, err
		}
	} else {
		logger.Info("DaemonSet identical, skipping update", "name", obj.Name)
	}
	return isDaemonSetReady(obj.Name, n), nil
}

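// Call-graph sketch of the per-version interception implemented above:
//
//	DaemonSet(n)                              // top-most call, currentKernelVersion == ""
//	└── precompiledDriverDaemonsets(ctx, n)   // fans out over kernelVersionMap
//	    └── DaemonSet(n)                      // per-kernel call, currentKernelVersion set
//
// The OpenShift Driver Toolkit path follows the same shape via
// ocpDriverToolkitDaemonSets() and currentRhcosVersion.
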
// isDaemonsetSpecChanged returns true if the spec of the new DaemonSet
// differs from the existing one, compared via the hash annotation.
func isDaemonsetSpecChanged(current *appsv1.DaemonSet, new *appsv1.DaemonSet) bool {
	if current == nil && new != nil {
		return true
	}
	if current.Annotations == nil || new.Annotations == nil {
		panic("appsv1.DaemonSet.Annotations must be allocated prior to calling isDaemonsetSpecChanged()")
	}

	hashStr := utils.GetObjectHash(new)
	foundHashAnnotation := false

	for annotation, value := range current.Annotations {
		if annotation == NvidiaAnnotationHashKey {
			if value != hashStr {
				// store the new hash on the new DaemonSet spec and indicate an update is required
				new.Annotations[NvidiaAnnotationHashKey] = hashStr
				return true
			}
			foundHashAnnotation = true
			break
		}
	}

	if !foundHashAnnotation {
		// store the new hash on the new DaemonSet spec and indicate an update is required
		new.Annotations[NvidiaAnnotationHashKey] = hashStr
		return true
	}
	return false
}

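// A minimal sketch of the hash-annotation contract above; found is an
// existing DaemonSet and buildDaemonSetFromManifest() is a hypothetical
// helper that returns a freshly rendered object without the hash annotation:
//
//	desired := buildDaemonSetFromManifest()
//	desired.Annotations = map[string]string{} // must be allocated, see the panic above
//	if isDaemonsetSpecChanged(found, desired) {
//		// desired.Annotations[NvidiaAnnotationHashKey] now holds the
//		// recomputed hash; pushing the update persists it.
//		_ = n.client.Update(ctx, desired)
//	}
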
// The operator starts two pods in different stages to validate
// the correct working of the DaemonSets (driver and device-plugin). Therefore
// the operator waits until the Pod completes and checks the error status
// to advance to the next state.
func isPodReady(name string, n ClusterPolicyController, phase corev1.PodPhase) gpuv1.State {
	ctx := n.ctx
	opts := []client.ListOption{&client.MatchingLabels{"app": name}}

	n.logger.V(1).Info("Pod", "LabelSelector", fmt.Sprintf("app=%s", name))
	list := &corev1.PodList{}
	err := n.client.List(ctx, list, opts...)
	if err != nil {
		n.logger.Info("Could not get PodList", "Error", err)
	}
	n.logger.V(1).Info("Pod", "NumberOfPods", len(list.Items))
	if len(list.Items) == 0 {
		return gpuv1.NotReady
	}

	pd := list.Items[0]

	if pd.Status.Phase != phase {
		n.logger.V(1).Info("Pod", "Phase", pd.Status.Phase, "!=", phase)
		return gpuv1.NotReady
	}
	n.logger.V(1).Info("Pod", "Phase", pd.Status.Phase, "==", phase)
	return gpuv1.Ready
}

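// A usage sketch, assuming a validator pod labeled app=nvidia-cuda-validator
// (the label value is an assumption for illustration):
//
//	if isPodReady("nvidia-cuda-validator", n, corev1.PodSucceeded) != gpuv1.Ready {
//		return gpuv1.NotReady, nil // requeue until the validation pod completes
//	}
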
// SecurityContextConstraints creates SCC resources
func SecurityContextConstraints(n ClusterPolicyController) (gpuv1.State, error) {
	ctx := n.ctx
	state := n.idx
	obj := n.resources[state].SecurityContextConstraints.DeepCopy()
	obj.Namespace = n.operatorNamespace

	logger := n.logger.WithValues("SecurityContextConstraints", obj.Name, "Namespace", "default")

	// Check if state is disabled and clean up the resource if it exists
	if !n.isStateEnabled(n.stateNames[n.idx]) {
		err := n.client.Delete(ctx, obj)
		if err != nil && !apierrors.IsNotFound(err) {
			logger.Info("Couldn't delete", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Disabled, nil
	}

	for idx := range obj.Users {
		if obj.Users[idx] != "FILLED BY THE OPERATOR" {
			continue
		}
		obj.Users[idx] = fmt.Sprintf("system:serviceaccount:%s:%s", obj.Namespace, obj.Name)
	}

	if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
		return gpuv1.NotReady, err
	}

	found := &secv1.SecurityContextConstraints{}
	err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found)
	if err != nil && apierrors.IsNotFound(err) {
		logger.Info("Not found, creating...")
		err = n.client.Create(ctx, obj)
		if err != nil {
			logger.Info("Couldn't create", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Ready, nil
	} else if err != nil {
		return gpuv1.NotReady, err
	}

	logger.Info("Found Resource, updating...")
	obj.ResourceVersion = found.ResourceVersion

	err = n.client.Update(ctx, obj)
	if err != nil {
		logger.Info("Couldn't update", "Error", err)
		return gpuv1.NotReady, err
	}
	return gpuv1.Ready, nil
}

// Service creates a Service object
func Service(n ClusterPolicyController) (gpuv1.State, error) {
	ctx := n.ctx
	state := n.idx
	obj := n.resources[state].Service.DeepCopy()

	obj.Namespace = n.operatorNamespace

	logger := n.logger.WithValues("Service", obj.Name, "Namespace", obj.Namespace)

	// Check if state is disabled and clean up the resource if it exists
	if !n.isStateEnabled(n.stateNames[n.idx]) {
		err := n.client.Delete(ctx, obj)
		if err != nil && !apierrors.IsNotFound(err) {
			logger.Info("Couldn't delete", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Disabled, nil
	}

	if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
		return gpuv1.NotReady, err
	}

	err := preprocessService(obj, n)
	if err != nil {
		logger.Info("Couldn't preprocess Service", "Error", err)
		return gpuv1.NotReady, err
	}

	found := &corev1.Service{}
	err = n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
	if err != nil && apierrors.IsNotFound(err) {
		logger.Info("Not found, creating...")
		err = n.client.Create(ctx, obj)
		if err != nil {
			logger.Info("Couldn't create", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Ready, nil
	} else if err != nil {
		return gpuv1.NotReady, err
	}

	logger.Info("Found Resource, updating...")
	obj.ResourceVersion = found.ResourceVersion
	// preserve the ClusterIP allocated at creation time; the field is immutable
	obj.Spec.ClusterIP = found.Spec.ClusterIP

	err = n.client.Update(ctx, obj)
	if err != nil {
		logger.Info("Couldn't update", "Error", err)
		return gpuv1.NotReady, err
	}
	return gpuv1.Ready, nil
}

func crdExists(n ClusterPolicyController, name string) (bool, error) {
	crd := &apiextensionsv1.CustomResourceDefinition{}
	err := n.client.Get(n.ctx, client.ObjectKey{Name: name}, crd)
	if err != nil && apierrors.IsNotFound(err) {
		return false, nil
	} else if err != nil {
		return false, err
	}

	return true, nil
}

// ServiceMonitor creates a ServiceMonitor object
func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
	ctx := n.ctx
	state := n.idx
	obj := n.resources[state].ServiceMonitor.DeepCopy()
	obj.Namespace = n.operatorNamespace

	logger := n.logger.WithValues("ServiceMonitor", obj.Name, "Namespace", obj.Namespace)

	// Check if the ServiceMonitor CRD is installed
	serviceMonitorCRDExists, err := crdExists(n, ServiceMonitorCRDName)
	if err != nil {
		return gpuv1.NotReady, err
	}

	// Check if state is disabled and clean up the resource if it exists
	if !n.isStateEnabled(n.stateNames[state]) {
		if !serviceMonitorCRDExists {
			return gpuv1.Ready, nil
		}
		err := n.client.Delete(ctx, obj)
		if err != nil && !apierrors.IsNotFound(err) {
			logger.Info("Couldn't delete", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Disabled, nil
	}

	if n.stateNames[state] == "state-dcgm-exporter" {
		serviceMonitor := n.singleton.Spec.DCGMExporter.ServiceMonitor
		// Check if the ServiceMonitor is disabled and clean up the resource if it exists
		if serviceMonitor == nil || !serviceMonitor.IsEnabled() {
			if !serviceMonitorCRDExists {
				return gpuv1.Ready, nil
			}
			err := n.client.Delete(ctx, obj)
			if err != nil && !apierrors.IsNotFound(err) {
				logger.Info("Couldn't delete", "Error", err)
				return gpuv1.NotReady, err
			}
			return gpuv1.Disabled, nil
		}

		if !serviceMonitorCRDExists {
			logger.Error(fmt.Errorf("couldn't find ServiceMonitor CRD"), "Install Prometheus and the necessary CRDs for gathering GPU metrics!")
			return gpuv1.NotReady, nil
		}

		// Apply custom edits for DCGM Exporter
		if serviceMonitor.Interval != "" {
			obj.Spec.Endpoints[0].Interval = serviceMonitor.Interval
		}

		if serviceMonitor.HonorLabels != nil {
			obj.Spec.Endpoints[0].HonorLabels = *serviceMonitor.HonorLabels
		}

		if serviceMonitor.AdditionalLabels != nil {
			for key, value := range serviceMonitor.AdditionalLabels {
				obj.Labels[key] = value
			}
		}
		if serviceMonitor.Relabelings != nil {
			relabelConfigs := make([]promv1.RelabelConfig, len(serviceMonitor.Relabelings))
			for i, relabel := range serviceMonitor.Relabelings {
				if relabel != nil {
					relabelConfigs[i] = *relabel
				}
			}
			obj.Spec.Endpoints[0].RelabelConfigs = relabelConfigs
		}
	}
	if n.stateNames[state] == "state-operator-metrics" || n.stateNames[state] == "state-node-status-exporter" {
		// if the ServiceMonitor CRD is missing, assume prometheus is not set up and skip CR creation
		if !serviceMonitorCRDExists {
			logger.V(1).Info("ServiceMonitor CRD is missing, ignoring creation of CR for operator-metrics")
			return gpuv1.Ready, nil
		}
		obj.Spec.NamespaceSelector.MatchNames = []string{obj.Namespace}
	}

	for idx := range obj.Spec.NamespaceSelector.MatchNames {
		if obj.Spec.NamespaceSelector.MatchNames[idx] != "FILLED BY THE OPERATOR" {
			continue
		}
		obj.Spec.NamespaceSelector.MatchNames[idx] = obj.Namespace
	}

	if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
		return gpuv1.NotReady, err
	}

	found := &promv1.ServiceMonitor{}
	err = n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
	if err != nil && apierrors.IsNotFound(err) {
		logger.Info("Not found, creating...")
		err = n.client.Create(ctx, obj)
		if err != nil {
			logger.Info("Couldn't create", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Ready, nil
	} else if err != nil {
		return gpuv1.NotReady, err
	}

	logger.Info("Found Resource, updating...")
	obj.ResourceVersion = found.ResourceVersion

	err = n.client.Update(ctx, obj)
	if err != nil {
		logger.Info("Couldn't update", "Error", err)
		return gpuv1.NotReady, err
	}
	return gpuv1.Ready, nil
}

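// A sketch of a Relabelings entry as it would arrive from the ClusterPolicy
// spec and land in obj.Spec.Endpoints[0].RelabelConfigs above (field values
// are hypothetical; the exact promv1.RelabelConfig field set depends on the
// vendored prometheus-operator version):
//
//	keepUtil := &promv1.RelabelConfig{
//		SourceLabels: []promv1.LabelName{"__name__"},
//		Regex:        "DCGM_FI_DEV_GPU_UTIL",
//		Action:       "keep",
//	}
//	// spec.dcgmExporter.serviceMonitor.relabelings = []*promv1.RelabelConfig{keepUtil}
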
func transformRuntimeClassLegacy(n ClusterPolicyController, spec nodev1.RuntimeClass) (gpuv1.State, error) {
	ctx := n.ctx
	obj := &nodev1beta1.RuntimeClass{}

	obj.Name = spec.Name
	obj.Handler = spec.Handler

	// apply runtime class name as per ClusterPolicy
	if obj.Name == "FILLED_BY_OPERATOR" {
		runtimeClassName := getRuntimeClassName(&n.singleton.Spec)
		obj.Name = runtimeClassName
		obj.Handler = runtimeClassName
	}

	obj.Labels = spec.Labels

	logger := n.logger.WithValues("RuntimeClass", obj.Name)

	if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
		return gpuv1.NotReady, err
	}

	found := &nodev1beta1.RuntimeClass{}
	err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found)
	if err != nil && apierrors.IsNotFound(err) {
		logger.Info("Not found, creating...")
		err = n.client.Create(ctx, obj)
		if err != nil {
			logger.Info("Couldn't create", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Ready, nil
	} else if err != nil {
		return gpuv1.NotReady, err
	}

	logger.Info("Found Resource, updating...")
	obj.ResourceVersion = found.ResourceVersion

	err = n.client.Update(ctx, obj)
	if err != nil {
		logger.Info("Couldn't update", "Error", err)
		return gpuv1.NotReady, err
	}
	return gpuv1.Ready, nil
}

func transformRuntimeClass(n ClusterPolicyController, spec nodev1.RuntimeClass) (gpuv1.State, error) {
	ctx := n.ctx
	obj := &nodev1.RuntimeClass{}

	obj.Name = spec.Name
	obj.Handler = spec.Handler

	// apply runtime class name as per ClusterPolicy
	if obj.Name == "FILLED_BY_OPERATOR" {
		runtimeClassName := getRuntimeClassName(&n.singleton.Spec)
		obj.Name = runtimeClassName
		obj.Handler = runtimeClassName
	}

	obj.Labels = spec.Labels

	logger := n.logger.WithValues("RuntimeClass", obj.Name)

	if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
		return gpuv1.NotReady, err
	}

	found := &nodev1.RuntimeClass{}
	err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found)
	if err != nil && apierrors.IsNotFound(err) {
		logger.Info("Not found, creating...")
		err = n.client.Create(ctx, obj)
		if err != nil {
			logger.Info("Couldn't create", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Ready, nil
	} else if err != nil {
		return gpuv1.NotReady, err
	}

	logger.Info("Found Resource, updating...")
	obj.ResourceVersion = found.ResourceVersion

	err = n.client.Update(ctx, obj)
	if err != nil {
		logger.Info("Couldn't update", "Error", err)
		return gpuv1.NotReady, err
	}
	return gpuv1.Ready, nil
}

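// Sketch of the object shape produced above, assuming the default runtime
// class name "nvidia" (the actual name comes from getRuntimeClassName()):
//
//	obj := &nodev1.RuntimeClass{}
//	obj.Name = "nvidia"
//	obj.Handler = "nvidia" // must match the runtime handler configured in containerd/CRI-O
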
func transformKataRuntimeClasses(n ClusterPolicyController) (gpuv1.State, error) {
	ctx := n.ctx
	state := n.idx
	config := n.singleton.Spec

	// Get all existing Kata RuntimeClasses
	opts := []client.ListOption{&client.MatchingLabels{"nvidia.com/kata-runtime-class": "true"}}
	list := &nodev1.RuntimeClassList{}
	err := n.client.List(ctx, list, opts...)
	if err != nil {
		n.logger.Info("Could not get Kata RuntimeClassList", "Error", err)
		return gpuv1.NotReady, fmt.Errorf("error getting kata RuntimeClassList: %v", err)
	}
	n.logger.V(1).Info("Kata RuntimeClasses", "Number", len(list.Items))

	if !config.KataManager.IsEnabled() {
		// Delete all Kata RuntimeClasses
		n.logger.Info("Kata Manager disabled, deleting all Kata RuntimeClasses")
		for _, rc := range list.Items {
			rc := rc
			n.logger.V(1).Info("Deleting Kata RuntimeClass", "Name", rc.Name)
			err := n.client.Delete(ctx, &rc)
			if err != nil {
				return gpuv1.NotReady, fmt.Errorf("error deleting kata RuntimeClass '%s': %v", rc.Name, err)
			}
		}
		return gpuv1.Ready, nil
	}

	// Get names of desired kata RuntimeClasses
	rcNames := make(map[string]struct{})
	for _, rc := range config.KataManager.Config.RuntimeClasses {
		rcNames[rc.Name] = struct{}{}
	}

	// Delete any existing Kata RuntimeClasses that are no longer specified in the KataManager configuration
	for _, rc := range list.Items {
		if _, ok := rcNames[rc.Name]; !ok {
			rc := rc
			n.logger.Info("Deleting Kata RuntimeClass", "Name", rc.Name)
			err := n.client.Delete(ctx, &rc)
			if err != nil {
				return gpuv1.NotReady, fmt.Errorf("error deleting kata RuntimeClass '%s': %v", rc.Name, err)
			}
		}
	}

	// Using the kata RuntimeClass template, create/update the RuntimeClass objects specified in the KataManager configuration
	template := n.resources[state].RuntimeClasses[0]
	for _, rc := range config.KataManager.Config.RuntimeClasses {
		logger := n.logger.WithValues("RuntimeClass", rc.Name)

		if rc.Name == config.Operator.RuntimeClass {
			return gpuv1.NotReady, fmt.Errorf("error creating kata runtimeclass '%s' as it conflicts with the runtimeclass used for the gpu-operator operand pods itself", rc.Name)
		}

		obj := nodev1.RuntimeClass{}
		obj.Name = rc.Name
		obj.Handler = rc.Name
		obj.Labels = template.Labels
		obj.Scheduling = &nodev1.Scheduling{}
		nodeSelector := make(map[string]string)
		for k, v := range template.Scheduling.NodeSelector {
			nodeSelector[k] = v
		}
		if rc.NodeSelector != nil {
			// append user-provided selectors to the default nodeSelector
			for k, v := range rc.NodeSelector {
				nodeSelector[k] = v
			}
		}
		obj.Scheduling.NodeSelector = nodeSelector

		if err := controllerutil.SetControllerReference(n.singleton, &obj, n.scheme); err != nil {
			return gpuv1.NotReady, err
		}

		found := &nodev1.RuntimeClass{}
		err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found)
		if err != nil && apierrors.IsNotFound(err) {
			logger.Info("Not found, creating...")
			err = n.client.Create(ctx, &obj)
			if err != nil {
				logger.Info("Couldn't create", "Error", err)
				return gpuv1.NotReady, err
			}
			continue
		} else if err != nil {
			return gpuv1.NotReady, err
		}

		logger.Info("Found Resource, updating...")
		obj.ResourceVersion = found.ResourceVersion

		err = n.client.Update(ctx, &obj)
		if err != nil {
			logger.Info("Couldn't update", "Error", err)
			return gpuv1.NotReady, err
		}
	}
	return gpuv1.Ready, nil
}

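// Sketch of the mapping performed above for one configured runtime class
// (names and the node-selector value are hypothetical; the defaults come
// from the kata RuntimeClass template in the manifests):
//
//	// spec.kataManager.config.runtimeClasses entry: {name: "kata-nvidia-gpu"}
//	obj := nodev1.RuntimeClass{}
//	obj.Name = "kata-nvidia-gpu"
//	obj.Handler = "kata-nvidia-gpu"
//	obj.Scheduling = &nodev1.Scheduling{
//		NodeSelector: map[string]string{
//			"nvidia.com/gpu.workload.config": "vm-passthrough", // assumed template default
//		},
//	}
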
func RuntimeClasses(n ClusterPolicyController) (gpuv1.State, error) {
	status := gpuv1.Ready
	state := n.idx

	if n.stateNames[state] == "state-kata-manager" {
		return transformKataRuntimeClasses(n)
	}

	createRuntimeClassFunc := transformRuntimeClass
	if semver.Compare(n.k8sVersion, nodev1MinimumAPIVersion) <= 0 {
		createRuntimeClassFunc = transformRuntimeClassLegacy
	}

	for _, obj := range n.resources[state].RuntimeClasses {
		obj := obj
		// When CDI is disabled, do not create the additional 'nvidia-cdi' and
		// 'nvidia-legacy' runtime classes. Delete these objects if they were
		// previously created.
		if !n.singleton.Spec.CDI.IsEnabled() && (obj.Name == "nvidia-cdi" || obj.Name == "nvidia-legacy") {
			err := n.client.Delete(n.ctx, &obj)
			if err != nil && !apierrors.IsNotFound(err) {
				n.logger.Info("Couldn't delete", "RuntimeClass", obj.Name, "Error", err)
				return gpuv1.NotReady, err
			}
			continue
		}
		stat, err := createRuntimeClassFunc(n, obj)
		if err != nil {
			return stat, err
		}
		if stat != gpuv1.Ready {
			status = gpuv1.NotReady
		}
	}
	return status, nil
}

// PrometheusRule creates a PrometheusRule object
func PrometheusRule(n ClusterPolicyController) (gpuv1.State, error) {
	ctx := n.ctx
	state := n.idx
	obj := n.resources[state].PrometheusRule.DeepCopy()
	obj.Namespace = n.operatorNamespace

	logger := n.logger.WithValues("PrometheusRule", obj.Name)

	if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
		return gpuv1.NotReady, err
	}

	found := &promv1.PrometheusRule{}
	err := n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
	if err != nil && apierrors.IsNotFound(err) {
		logger.Info("Not found, creating...")
		err = n.client.Create(ctx, obj)
		if err != nil {
			logger.Info("Couldn't create", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Ready, nil
	} else if err != nil {
		return gpuv1.NotReady, err
	}

	logger.Info("Found Resource, updating...")
	obj.ResourceVersion = found.ResourceVersion

	err = n.client.Update(ctx, obj)
	if err != nil {
		logger.Info("Couldn't update", "Error", err)
		return gpuv1.NotReady, err
	}
	return gpuv1.Ready, nil
}