• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

NVIDIA / gpu-operator / 16360201712

18 Jul 2025 01:50AM UTC coverage: 18.589% (+0.2%) from 18.433%
16360201712

Pull #1541

github

cdesiniotis
Integrate NVIDIA DRA Driver for GPUs as an operand

Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com>
Pull Request #1541: Integrate NVIDIA DRA Driver for GPUs as an operand

69 of 276 new or added lines in 6 files covered. (25.0%)

1227 existing lines in 4 files now uncovered.

2179 of 11722 relevant lines covered (18.59%)

0.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

38.13
/controllers/object_controls.go
1
/**
2
# Copyright (c) NVIDIA CORPORATION.  All rights reserved.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
**/
16

17
package controllers
18

19
import (
20
        "bufio"
21
        "context"
22
        "errors"
23
        "fmt"
24
        "os"
25
        "path"
26
        "regexp"
27
        "sort"
28
        "strconv"
29
        "strings"
30

31
        "path/filepath"
32

33
        apiconfigv1 "github.com/openshift/api/config/v1"
34
        apiimagev1 "github.com/openshift/api/image/v1"
35
        secv1 "github.com/openshift/api/security/v1"
36
        promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
37
        "golang.org/x/mod/semver"
38
        appsv1 "k8s.io/api/apps/v1"
39
        corev1 "k8s.io/api/core/v1"
40
        nodev1 "k8s.io/api/node/v1"
41
        nodev1beta1 "k8s.io/api/node/v1beta1"
42
        resourceapi "k8s.io/api/resource/v1beta1"
43
        apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
44
        apierrors "k8s.io/apimachinery/pkg/api/errors"
45
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
46
        "k8s.io/apimachinery/pkg/runtime/schema"
47
        "k8s.io/apimachinery/pkg/types"
48
        "k8s.io/apimachinery/pkg/util/intstr"
49
        "sigs.k8s.io/controller-runtime/pkg/client"
50
        "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
51
        "sigs.k8s.io/yaml"
52

53
        gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
54
        "github.com/NVIDIA/gpu-operator/internal/consts"
55
        "github.com/NVIDIA/gpu-operator/internal/utils"
56
)
57

58
const (
	// DefaultContainerdConfigFile indicates the default config file path for containerd
	DefaultContainerdConfigFile = "/etc/containerd/config.toml"
	// DefaultContainerdSocketFile indicates the default containerd socket file
	DefaultContainerdSocketFile = "/run/containerd/containerd.sock"
	// DefaultDockerConfigFile indicates the default config file path for docker
	DefaultDockerConfigFile = "/etc/docker/daemon.json"
	// DefaultDockerSocketFile indicates the default docker socket file
	DefaultDockerSocketFile = "/var/run/docker.sock"
	// DefaultCRIOConfigFile indicates the default config file path for cri-o.
	// Note, config files in the drop-in directory, /etc/crio/crio.conf.d,
	// have a higher priority than the default /etc/crio/crio.conf file.
	DefaultCRIOConfigFile = "/etc/crio/crio.conf.d/99-nvidia.conf"
	// TrustedCAConfigMapName indicates the ConfigMap with custom user CA injected
	TrustedCAConfigMapName = "gpu-operator-trusted-ca"
	// TrustedCABundleFileName indicates the custom user CA certificate filename
	TrustedCABundleFileName = "ca-bundle.crt"
	// TrustedCABundleMountDir indicates the target mount directory of the user CA bundle
	TrustedCABundleMountDir = "/etc/pki/ca-trust/extracted/pem"
	// TrustedCACertificate indicates the injected CA certificate name
	TrustedCACertificate = "tls-ca-bundle.pem"
	// VGPULicensingConfigMountPath indicates the target mount path for the vGPU licensing configuration file
	VGPULicensingConfigMountPath = "/drivers/gridd.conf"
	// VGPULicensingFileName is the vGPU licensing configuration filename
	VGPULicensingFileName = "gridd.conf"
	// NLSClientTokenMountPath indicates the target mount path for the NLS client config token file (.tok)
	NLSClientTokenMountPath = "/drivers/ClientConfigToken/client_configuration_token.tok"
	// NLSClientTokenFileName is the NLS client config token filename
	NLSClientTokenFileName = "client_configuration_token.tok"
	// VGPUTopologyConfigMountPath indicates the target mount path for the vGPU topology daemon configuration file
	VGPUTopologyConfigMountPath = "/etc/nvidia/nvidia-topologyd.conf"
	// VGPUTopologyConfigFileName is the vGPU topology daemon configuration filename
	VGPUTopologyConfigFileName = "nvidia-topologyd.conf"
	// DefaultRuntimeClass represents the "nvidia" RuntimeClass
	DefaultRuntimeClass = "nvidia"
	// DriverInstallPathVolName represents the volume name for the driver install path provided to toolkit
	DriverInstallPathVolName = "driver-install-path"
	// DefaultRuntimeSocketTargetDir represents the target directory where the runtime socket directory will be mounted
	DefaultRuntimeSocketTargetDir = "/runtime/sock-dir/"
	// DefaultRuntimeConfigTargetDir represents the target directory where the runtime config directory will be mounted
	DefaultRuntimeConfigTargetDir = "/runtime/config-dir/"
	// ValidatorImageEnvName indicates the env name for the validator image passed
	ValidatorImageEnvName = "VALIDATOR_IMAGE"
	// ValidatorImagePullPolicyEnvName indicates the env name for the validator image pull policy passed
	ValidatorImagePullPolicyEnvName = "VALIDATOR_IMAGE_PULL_POLICY"
	// ValidatorImagePullSecretsEnvName indicates the env name for the validator image pull secrets passed
	ValidatorImagePullSecretsEnvName = "VALIDATOR_IMAGE_PULL_SECRETS"
	// ValidatorRuntimeClassEnvName indicates the env name of the runtime class to be applied to validator pods
	ValidatorRuntimeClassEnvName = "VALIDATOR_RUNTIME_CLASS"
	// MigStrategyEnvName indicates the env name for passing the MIG strategy
	MigStrategyEnvName = "MIG_STRATEGY"
	// MigPartedDefaultConfigMapName indicates the name of the ConfigMap containing the default mig-parted config
	MigPartedDefaultConfigMapName = "default-mig-parted-config"
	// MigDefaultGPUClientsConfigMapName indicates the name of the ConfigMap containing the default gpu-clients
	MigDefaultGPUClientsConfigMapName = "default-gpu-clients"
	// DCGMRemoteEngineEnvName indicates the env name to specify a remote DCGM host engine ip:port
	DCGMRemoteEngineEnvName = "DCGM_REMOTE_HOSTENGINE_INFO"
	// DCGMDefaultPort indicates the default port bound to the DCGM host engine
	DCGMDefaultPort = 5555
	// GPUDirectRDMAEnabledEnvName indicates if GPU direct RDMA is enabled through the GPU operator
	GPUDirectRDMAEnabledEnvName = "GPU_DIRECT_RDMA_ENABLED"
	// UseHostMOFEDEnvName indicates if the MOFED driver is pre-installed on the host
	UseHostMOFEDEnvName = "USE_HOST_MOFED"
	// MetricsConfigMountPath indicates the mount path for the custom dcgm metrics file
	MetricsConfigMountPath = "/etc/dcgm-exporter/" + MetricsConfigFileName
	// MetricsConfigFileName indicates the custom dcgm metrics file name
	MetricsConfigFileName = "dcgm-metrics.csv"
	// NvidiaAnnotationHashKey indicates the annotation name for the last applied hash by gpu-operator
	NvidiaAnnotationHashKey = "nvidia.com/last-applied-hash"
	// NvidiaDisableRequireEnvName is the env name to disable default cuda constraints
	NvidiaDisableRequireEnvName = "NVIDIA_DISABLE_REQUIRE"
	// GDSEnabledEnvName is the env name to enable GDS support with the device-plugin
	GDSEnabledEnvName = "GDS_ENABLED"
	// MOFEDEnabledEnvName is the env name to enable MOFED devices injection with the device-plugin
	MOFEDEnabledEnvName = "MOFED_ENABLED"
	// ServiceMonitorCRDName is the name of the CRD defining the ServiceMonitor kind
	ServiceMonitorCRDName = "servicemonitors.monitoring.coreos.com"
	// DefaultToolkitInstallDir is the default toolkit installation directory on the host
	DefaultToolkitInstallDir = "/usr/local/nvidia"
	// ToolkitInstallDirEnvName is the name of the toolkit container env for configuring where NVIDIA Container Toolkit is installed
	ToolkitInstallDirEnvName = "ROOT"
	// VgpuDMDefaultConfigMapName indicates the name of the ConfigMap containing the default vGPU devices configuration
	VgpuDMDefaultConfigMapName = "default-vgpu-devices-config"
	// VgpuDMDefaultConfigName indicates the name of the default configuration in the vGPU devices config file
	VgpuDMDefaultConfigName = "default"
	// NvidiaCtrRuntimeModeEnvName is the name of the toolkit container env for configuring the NVIDIA Container Runtime mode
	NvidiaCtrRuntimeModeEnvName = "NVIDIA_CONTAINER_RUNTIME_MODE"
	// NvidiaCtrRuntimeCDIPrefixesEnvName is the name of the toolkit container env for configuring the CDI annotation prefixes
	NvidiaCtrRuntimeCDIPrefixesEnvName = "NVIDIA_CONTAINER_RUNTIME_MODES_CDI_ANNOTATION_PREFIXES"
	// CDIEnabledEnvName is the name of the envvar used to enable CDI in the operands
	CDIEnabledEnvName = "CDI_ENABLED"
	// NvidiaCTKPathEnvName is the name of the envvar specifying the path to the 'nvidia-ctk' binary
	NvidiaCTKPathEnvName = "NVIDIA_CTK_PATH"
	// NvidiaCDIHookPathEnvName is the name of the envvar specifying the path to the 'nvidia-cdi-hook' binary
	NvidiaCDIHookPathEnvName = "NVIDIA_CDI_HOOK_PATH"
	// CrioConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration
	CrioConfigModeEnvName = "CRIO_CONFIG_MODE"
	// DeviceListStrategyEnvName is the name of the envvar for configuring the device-list-strategy in the device-plugin
	DeviceListStrategyEnvName = "DEVICE_LIST_STRATEGY"
	// CDIAnnotationPrefixEnvName is the name of the device-plugin envvar for configuring the CDI annotation prefix
	CDIAnnotationPrefixEnvName = "CDI_ANNOTATION_PREFIX"
	// KataManagerAnnotationHashKey is the annotation indicating the hash of the kata-manager configuration
	KataManagerAnnotationHashKey = "nvidia.com/kata-manager.last-applied-hash"
	// DefaultKataArtifactsDir is the default directory to store kata artifacts on the host
	DefaultKataArtifactsDir = "/opt/nvidia-gpu-operator/artifacts/runtimeclasses/"
	// PodControllerRevisionHashLabelKey is the label key for the pod controller revision hash value
	PodControllerRevisionHashLabelKey = "controller-revision-hash"
	// DefaultCCModeEnvName is the name of the envvar for configuring the default CC mode on all compatible GPUs on the node
	DefaultCCModeEnvName = "DEFAULT_CC_MODE"
	// OpenKernelModulesEnabledEnvName is the name of the driver-container envvar for enabling open GPU kernel module support
	OpenKernelModulesEnabledEnvName = "OPEN_KERNEL_MODULES_ENABLED"
	// KernelModuleTypeEnvName is the name of the driver-container envvar to set the desired kernel module type
	KernelModuleTypeEnvName = "KERNEL_MODULE_TYPE"
	// MPSRootEnvName is the name of the envvar for configuring the MPS root
	MPSRootEnvName = "MPS_ROOT"
	// DefaultMPSRoot is the default MPS root path on the host
	DefaultMPSRoot = "/run/nvidia/mps"
	// HostRootEnvName is the name of the envvar representing the root path of the underlying host
	HostRootEnvName = "HOST_ROOT"
	// DefaultDriverInstallDir represents the default path of a driver container installation
	DefaultDriverInstallDir = "/run/nvidia/driver"
	// DriverInstallDirEnvName is the name of the envvar used by the driver-validator to represent the driver install dir
	DriverInstallDirEnvName = "DRIVER_INSTALL_DIR"
	// DriverInstallDirCtrPathEnvName is the name of the envvar used by the driver-validator to represent the path
	// of the driver install dir mounted in the container
	DriverInstallDirCtrPathEnvName = "DRIVER_INSTALL_DIR_CTR_PATH"
)
185

186
// ContainerProbe defines container probe types
type ContainerProbe string

const (
	// Startup identifies a container startup probe
	Startup ContainerProbe = "startup"
	// Liveness identifies a container liveness probe
	Liveness ContainerProbe = "liveness"
	// Readiness identifies a container readiness probe
	Readiness ContainerProbe = "readiness"
)
197

198
// rootUID represents user 0
199
var rootUID = utils.Int64Ptr(0)
200

201
// RepoConfigPathMap indicates standard OS specific paths for repository configuration files
var RepoConfigPathMap = map[string]string{
	"centos": "/etc/yum.repos.d",
	"rhcos":  "/etc/yum.repos.d",
	"rhel":   "/etc/yum.repos.d",
	"ubuntu": "/etc/apt/sources.list.d",
}
208

209
// CertConfigPathMap indicates standard OS specific paths for ssl keys/certificates.
// Where Go looks for certs: https://golang.org/src/crypto/x509/root_linux.go
// Where OCP mounts proxy certs on RHCOS nodes:
// https://access.redhat.com/documentation/en-us/openshift_container_platform/4.3/html/authentication/ocp-certificates#proxy-certificates_ocp-certificates
var CertConfigPathMap = map[string]string{
	"centos": "/etc/pki/ca-trust/extracted/pem",
	"rhcos":  "/etc/pki/ca-trust/extracted/pem",
	"rhel":   "/etc/pki/ca-trust/extracted/pem",
	"ubuntu": "/usr/local/share/ca-certificates",
}
219

220
func newHostPathType(pathType corev1.HostPathType) *corev1.HostPathType {
1✔
221
        hostPathType := new(corev1.HostPathType)
1✔
222
        *hostPathType = pathType
1✔
223
        return hostPathType
1✔
224
}
1✔
225

226
// MountPathToVolumeSource maps a container mount path to a VolumeSource
227
type MountPathToVolumeSource map[string]corev1.VolumeSource
228

229
// SubscriptionPathMap contains information on OS-specific paths
230
// that provide entitlements/subscription details on the host.
231
// These are used to enable Driver Container's access to packages controlled by
232
// the distro through their subscription and support program.
233
var SubscriptionPathMap = map[string](MountPathToVolumeSource){
234
        "rhel": {
235
                "/run/secrets/etc-pki-entitlement": corev1.VolumeSource{
236
                        HostPath: &corev1.HostPathVolumeSource{
237
                                Path: "/etc/pki/entitlement",
238
                                Type: newHostPathType(corev1.HostPathDirectory),
239
                        },
240
                },
241
                "/run/secrets/redhat.repo": corev1.VolumeSource{
242
                        HostPath: &corev1.HostPathVolumeSource{
243
                                Path: "/etc/yum.repos.d/redhat.repo",
244
                                Type: newHostPathType(corev1.HostPathFile),
245
                        },
246
                },
247
                "/run/secrets/rhsm": corev1.VolumeSource{
248
                        HostPath: &corev1.HostPathVolumeSource{
249
                                Path: "/etc/rhsm",
250
                                Type: newHostPathType(corev1.HostPathDirectory),
251
                        },
252
                },
253
        },
254
        "rhcos": {
255
                "/run/secrets/etc-pki-entitlement": corev1.VolumeSource{
256
                        HostPath: &corev1.HostPathVolumeSource{
257
                                Path: "/etc/pki/entitlement",
258
                                Type: newHostPathType(corev1.HostPathDirectory),
259
                        },
260
                },
261
                "/run/secrets/redhat.repo": corev1.VolumeSource{
262
                        HostPath: &corev1.HostPathVolumeSource{
263
                                Path: "/etc/yum.repos.d/redhat.repo",
264
                                Type: newHostPathType(corev1.HostPathFile),
265
                        },
266
                },
267
                "/run/secrets/rhsm": corev1.VolumeSource{
268
                        HostPath: &corev1.HostPathVolumeSource{
269
                                Path: "/etc/rhsm",
270
                                Type: newHostPathType(corev1.HostPathDirectory),
271
                        },
272
                },
273
        },
274
        "sles": {
275
                "/etc/zypp/credentials.d": corev1.VolumeSource{
276
                        HostPath: &corev1.HostPathVolumeSource{
277
                                Path: "/etc/zypp/credentials.d",
278
                                Type: newHostPathType(corev1.HostPathDirectory),
279
                        },
280
                },
281
                "/etc/SUSEConnect": corev1.VolumeSource{
282
                        HostPath: &corev1.HostPathVolumeSource{
283
                                Path: "/etc/SUSEConnect",
284
                                Type: newHostPathType(corev1.HostPathFileOrCreate),
285
                        },
286
                },
287
        },
288
        "sl-micro": {
289
                "/etc/zypp/credentials.d": corev1.VolumeSource{
290
                        HostPath: &corev1.HostPathVolumeSource{
291
                                Path: "/etc/zypp/credentials.d",
292
                                Type: newHostPathType(corev1.HostPathDirectory),
293
                        },
294
                },
295
                "/etc/SUSEConnect": corev1.VolumeSource{
296
                        HostPath: &corev1.HostPathVolumeSource{
297
                                Path: "/etc/SUSEConnect",
298
                                Type: newHostPathType(corev1.HostPathFileOrCreate),
299
                        },
300
                },
301
        },
302
}
303

304
// controlFunc is an ordered list of functions that each reconcile one
// resource kind for the controller's current state, returning the resulting
// state (Ready/NotReady/Disabled) and any error.
type controlFunc []func(n ClusterPolicyController) (gpuv1.State, error)
305

306
// ServiceAccount creates ServiceAccount resource
307
func ServiceAccount(n ClusterPolicyController) (gpuv1.State, error) {
1✔
308
        ctx := n.ctx
1✔
309
        state := n.idx
1✔
310
        obj := n.resources[state].ServiceAccount.DeepCopy()
1✔
311
        obj.Namespace = n.operatorNamespace
1✔
312

1✔
313
        logger := n.logger.WithValues("ServiceAccount", obj.Name, "Namespace", obj.Namespace)
1✔
314

1✔
315
        // Check if state is disabled and cleanup resource if exists
1✔
316
        if !n.isStateEnabled(n.stateNames[n.idx]) {
1✔
317
                err := n.client.Delete(ctx, obj)
×
318
                if err != nil && !apierrors.IsNotFound(err) {
×
319
                        logger.Info("Couldn't delete", "Error", err)
×
320
                        return gpuv1.NotReady, err
×
321
                }
×
322
                return gpuv1.Disabled, nil
×
323
        }
324

325
        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
1✔
326
                return gpuv1.NotReady, err
×
327
        }
×
328

329
        if err := n.client.Create(ctx, obj); err != nil {
1✔
330
                if apierrors.IsAlreadyExists(err) {
×
331
                        logger.Info("Found Resource, skipping update")
×
332
                        return gpuv1.Ready, nil
×
333
                }
×
334

335
                logger.Info("Couldn't create", "Error", err)
×
336
                return gpuv1.NotReady, err
×
337
        }
338
        return gpuv1.Ready, nil
1✔
339
}
340

341
// Role creates Role resource
342
func Role(n ClusterPolicyController) (gpuv1.State, error) {
1✔
343
        ctx := n.ctx
1✔
344
        state := n.idx
1✔
345
        obj := n.resources[state].Role.DeepCopy()
1✔
346
        obj.Namespace = n.operatorNamespace
1✔
347

1✔
348
        logger := n.logger.WithValues("Role", obj.Name, "Namespace", obj.Namespace)
1✔
349

1✔
350
        // Check if state is disabled and cleanup resource if exists
1✔
351
        if !n.isStateEnabled(n.stateNames[n.idx]) {
1✔
352
                err := n.client.Delete(ctx, obj)
×
353
                if err != nil && !apierrors.IsNotFound(err) {
×
354
                        logger.Info("Couldn't delete", "Error", err)
×
355
                        return gpuv1.NotReady, err
×
356
                }
×
UNCOV
357
                return gpuv1.Disabled, nil
×
358
        }
359

360
        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
1✔
UNCOV
361
                return gpuv1.NotReady, err
×
UNCOV
362
        }
×
363

364
        if err := n.client.Create(ctx, obj); err != nil {
1✔
UNCOV
365
                if apierrors.IsAlreadyExists(err) {
×
UNCOV
366
                        logger.Info("Found Resource, updating...")
×
UNCOV
367
                        err = n.client.Update(ctx, obj)
×
368
                        if err != nil {
×
369
                                logger.Info("Couldn't update", "Error", err)
×
370
                                return gpuv1.NotReady, err
×
371
                        }
×
372
                        return gpuv1.Ready, nil
×
373
                }
374

UNCOV
375
                logger.Info("Couldn't create", "Error", err)
×
UNCOV
376
                return gpuv1.NotReady, err
×
377
        }
378

379
        return gpuv1.Ready, nil
1✔
380
}
381

382
// RoleBinding creates RoleBinding resource
383
func RoleBinding(n ClusterPolicyController) (gpuv1.State, error) {
1✔
384
        ctx := n.ctx
1✔
385
        state := n.idx
1✔
386
        obj := n.resources[state].RoleBinding.DeepCopy()
1✔
387
        obj.Namespace = n.operatorNamespace
1✔
388

1✔
389
        logger := n.logger.WithValues("RoleBinding", obj.Name, "Namespace", obj.Namespace)
1✔
390

1✔
391
        // Check if state is disabled and cleanup resource if exists
1✔
392
        if !n.isStateEnabled(n.stateNames[n.idx]) {
1✔
UNCOV
393
                err := n.client.Delete(ctx, obj)
×
UNCOV
394
                if err != nil && !apierrors.IsNotFound(err) {
×
UNCOV
395
                        logger.Info("Couldn't delete", "Error", err)
×
UNCOV
396
                        return gpuv1.NotReady, err
×
UNCOV
397
                }
×
398
                return gpuv1.Disabled, nil
×
399
        }
400

401
        for idx := range obj.Subjects {
2✔
402
                // we don't want to update ALL the Subjects[].Namespace, eg we need to keep 'openshift-monitoring'
1✔
403
                // for allowing PrometheusOperator to scrape our metrics resources:
1✔
404
                // see in assets/state-dcgm-exporter, 0500_prom_rolebinding_openshift.yaml vs 0300_rolebinding.yaml
1✔
405
                if obj.Subjects[idx].Namespace != "FILLED BY THE OPERATOR" {
1✔
UNCOV
406
                        continue
×
407
                }
408
                obj.Subjects[idx].Namespace = n.operatorNamespace
1✔
409
        }
410

411
        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
1✔
412
                return gpuv1.NotReady, err
×
413
        }
×
414

415
        if err := n.client.Create(ctx, obj); err != nil {
1✔
UNCOV
416
                if apierrors.IsAlreadyExists(err) {
×
UNCOV
417
                        logger.Info("Found Resource, updating...")
×
UNCOV
418
                        err = n.client.Update(ctx, obj)
×
UNCOV
419
                        if err != nil {
×
UNCOV
420
                                logger.Info("Couldn't update", "Error", err)
×
UNCOV
421
                                return gpuv1.NotReady, err
×
422
                        }
×
423
                        return gpuv1.Ready, nil
×
424
                }
425

426
                logger.Info("Couldn't create", "Error", err)
×
427
                return gpuv1.NotReady, err
×
428
        }
429

430
        return gpuv1.Ready, nil
1✔
431
}
432

433
// ClusterRole creates ClusterRole resource
434
func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) {
1✔
435
        ctx := n.ctx
1✔
436
        state := n.idx
1✔
437
        obj := n.resources[state].ClusterRole.DeepCopy()
1✔
438
        obj.Namespace = n.operatorNamespace
1✔
439

1✔
440
        logger := n.logger.WithValues("ClusterRole", obj.Name, "Namespace", obj.Namespace)
1✔
441

1✔
442
        // Check if state is disabled and cleanup resource if exists
1✔
443
        if !n.isStateEnabled(n.stateNames[n.idx]) {
1✔
444
                err := n.client.Delete(ctx, obj)
×
445
                if err != nil && !apierrors.IsNotFound(err) {
×
446
                        logger.Info("Couldn't delete", "Error", err)
×
447
                        return gpuv1.NotReady, err
×
448
                }
×
449
                return gpuv1.Disabled, nil
×
450
        }
451

452
        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
1✔
453
                return gpuv1.NotReady, err
×
454
        }
×
455

456
        if err := n.client.Create(ctx, obj); err != nil {
1✔
457
                if apierrors.IsAlreadyExists(err) {
×
458
                        logger.Info("Found Resource, updating...")
×
459
                        err = n.client.Update(ctx, obj)
×
460
                        if err != nil {
×
UNCOV
461
                                logger.Info("Couldn't update", "Error", err)
×
UNCOV
462
                                return gpuv1.NotReady, err
×
463
                        }
×
UNCOV
464
                        return gpuv1.Ready, nil
×
465
                }
466

UNCOV
467
                logger.Info("Couldn't create", "Error", err)
×
UNCOV
468
                return gpuv1.NotReady, err
×
469
        }
470

471
        return gpuv1.Ready, nil
1✔
472
}
473

474
// ClusterRoleBinding creates ClusterRoleBinding resource
475
func ClusterRoleBinding(n ClusterPolicyController) (gpuv1.State, error) {
1✔
476
        ctx := n.ctx
1✔
477
        state := n.idx
1✔
478
        obj := n.resources[state].ClusterRoleBinding.DeepCopy()
1✔
479
        obj.Namespace = n.operatorNamespace
1✔
480

1✔
481
        logger := n.logger.WithValues("ClusterRoleBinding", obj.Name, "Namespace", obj.Namespace)
1✔
482

1✔
483
        // Check if state is disabled and cleanup resource if exists
1✔
484
        if !n.isStateEnabled(n.stateNames[n.idx]) {
1✔
485
                err := n.client.Delete(ctx, obj)
×
486
                if err != nil && !apierrors.IsNotFound(err) {
×
487
                        logger.Info("Couldn't delete", "Error", err)
×
488
                        return gpuv1.NotReady, err
×
489
                }
×
490
                return gpuv1.Disabled, nil
×
491
        }
492

493
        for idx := range obj.Subjects {
2✔
494
                obj.Subjects[idx].Namespace = n.operatorNamespace
1✔
495
        }
1✔
496

497
        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
1✔
UNCOV
498
                return gpuv1.NotReady, err
×
UNCOV
499
        }
×
500

501
        if err := n.client.Create(ctx, obj); err != nil {
1✔
502
                if apierrors.IsAlreadyExists(err) {
×
503
                        logger.Info("Found Resource, updating...")
×
504
                        err = n.client.Update(ctx, obj)
×
505
                        if err != nil {
×
506
                                logger.Info("Couldn't update", "Error", err)
×
507
                                return gpuv1.NotReady, err
×
508
                        }
×
509
                        return gpuv1.Ready, nil
×
510
                }
511

512
                logger.Info("Couldn't create", "Error", err)
×
513
                return gpuv1.NotReady, err
×
514
        }
515

516
        return gpuv1.Ready, nil
1✔
517
}
518

519
// createConfigMap creates a ConfigMap resource
520
func createConfigMap(n ClusterPolicyController, configMapIdx int) (gpuv1.State, error) {
1✔
521
        ctx := n.ctx
1✔
522
        state := n.idx
1✔
523
        config := n.singleton.Spec
1✔
524
        obj := n.resources[state].ConfigMaps[configMapIdx].DeepCopy()
1✔
525
        obj.Namespace = n.operatorNamespace
1✔
526

1✔
527
        logger := n.logger.WithValues("ConfigMap", obj.Name, "Namespace", obj.Namespace)
1✔
528

1✔
529
        // Check if state is disabled and cleanup resource if exists
1✔
530
        if !n.isStateEnabled(n.stateNames[n.idx]) {
1✔
531
                err := n.client.Delete(ctx, obj)
×
532
                if err != nil && !apierrors.IsNotFound(err) {
×
UNCOV
533
                        logger.Info("Couldn't delete", "Error", err)
×
UNCOV
534
                        return gpuv1.NotReady, err
×
UNCOV
535
                }
×
UNCOV
536
                return gpuv1.Disabled, nil
×
537
        }
538

539
        // avoid creating default 'mig-parted-config' ConfigMap if custom one is provided
540
        if obj.Name == MigPartedDefaultConfigMapName {
1✔
541
                if config.MIGManager.Config != nil && config.MIGManager.Config.Name != "" && config.MIGManager.Config.Name != MigPartedDefaultConfigMapName {
×
UNCOV
542
                        logger.Info(fmt.Sprintf("Not creating resource, custom ConfigMap provided: %s", config.MIGManager.Config.Name))
×
UNCOV
543
                        return gpuv1.Ready, nil
×
544
                }
×
545
        }
546

547
        // avoid creating default 'gpu-clients' ConfigMap if custom one is provided
548
        if obj.Name == MigDefaultGPUClientsConfigMapName {
1✔
549
                if config.MIGManager.GPUClientsConfig != nil && config.MIGManager.GPUClientsConfig.Name != "" {
×
550
                        logger.Info(fmt.Sprintf("Not creating resource, custom ConfigMap provided: %s", config.MIGManager.GPUClientsConfig.Name))
×
551
                        return gpuv1.Ready, nil
×
UNCOV
552
                }
×
553
        }
554

555
        // avoid creating default vGPU device manager ConfigMap if custom one provided
556
        if obj.Name == VgpuDMDefaultConfigMapName {
1✔
UNCOV
557
                if config.VGPUDeviceManager.Config != nil && config.VGPUDeviceManager.Config.Name != "" {
×
UNCOV
558
                        logger.Info(fmt.Sprintf("Not creating resource, custom ConfigMap provided: %s", config.VGPUDeviceManager.Config.Name))
×
UNCOV
559
                        return gpuv1.Ready, nil
×
UNCOV
560
                }
×
561
        }
562

563
        if obj.Name == "nvidia-kata-manager-config" {
1✔
564
                data, err := yaml.Marshal(config.KataManager.Config)
×
565
                if err != nil {
×
566
                        return gpuv1.NotReady, fmt.Errorf("failed to marshal kata manager config: %v", err)
×
567
                }
×
568
                obj.Data = map[string]string{
×
569
                        "config.yaml": string(data),
×
570
                }
×
571
        }
572

573
        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
1✔
574
                return gpuv1.NotReady, err
×
575
        }
×
576

577
        if err := n.client.Create(ctx, obj); err != nil {
1✔
UNCOV
578
                if !apierrors.IsAlreadyExists(err) {
×
UNCOV
579
                        logger.Info("Couldn't create", "Error", err)
×
UNCOV
580
                        return gpuv1.NotReady, err
×
UNCOV
581
                }
×
582

UNCOV
583
                logger.Info("Found Resource, updating...")
×
UNCOV
584
                err = n.client.Update(ctx, obj)
×
UNCOV
585
                if err != nil {
×
UNCOV
586
                        logger.Info("Couldn't update", "Error", err)
×
UNCOV
587
                        return gpuv1.NotReady, err
×
UNCOV
588
                }
×
589
        }
590

591
        return gpuv1.Ready, nil
1✔
592
}
593

594
// ConfigMaps creates ConfigMap resource(s)
595
func ConfigMaps(n ClusterPolicyController) (gpuv1.State, error) {
1✔
596
        status := gpuv1.Ready
1✔
597
        state := n.idx
1✔
598
        for i := range n.resources[state].ConfigMaps {
2✔
599
                stat, err := createConfigMap(n, i)
1✔
600
                if err != nil {
1✔
601
                        return stat, err
×
602
                }
×
603
                if stat != gpuv1.Ready {
1✔
UNCOV
604
                        status = gpuv1.NotReady
×
UNCOV
605
                }
×
606
        }
607
        return status, nil
1✔
608
}
609

610
// getKernelVersionsMap returns a map of kernel versions to their corresponding OS from all GPU nodes in the cluster
611
func (n ClusterPolicyController) getKernelVersionsMap() (map[string]string, error) {
1✔
612
        kernelVersionMap := make(map[string]string)
1✔
613
        ctx := n.ctx
1✔
614
        logger := n.logger.WithValues("Request.Namespace", "default", "Request.Name", "Node")
1✔
615

1✔
616
        // Filter only GPU nodes
1✔
617
        opts := []client.ListOption{
1✔
618
                client.MatchingLabels{"nvidia.com/gpu.present": "true"},
1✔
619
        }
1✔
620

1✔
621
        list := &corev1.NodeList{}
1✔
622
        err := n.client.List(ctx, list, opts...)
1✔
623
        if err != nil {
1✔
624
                logger.Info("Could not get NodeList", "ERROR", err)
×
625
                return nil, err
×
626
        }
×
627

628
        if len(list.Items) == 0 {
1✔
UNCOV
629
                // none of the nodes matched nvidia GPU label
×
UNCOV
630
                // either the nodes do not have GPUs, or NFD is not running
×
UNCOV
631
                logger.Info("Could not get any nodes to match nvidia.com/gpu.present label")
×
632
                return nil, nil
×
633
        }
×
634

635
        for _, node := range list.Items {
2✔
636
                labels := node.GetLabels()
1✔
637
                if kernelVersion, ok := labels[nfdKernelLabelKey]; ok {
2✔
638
                        logger.Info("Found kernel version label", "version", kernelVersion)
1✔
639
                        // get OS version for this kernel
1✔
640
                        osType := labels[nfdOSReleaseIDLabelKey]
1✔
641
                        osVersion := labels[nfdOSVersionIDLabelKey]
1✔
642
                        nodeOS := fmt.Sprintf("%s%s", osType, osVersion)
1✔
643
                        if os, ok := kernelVersionMap[kernelVersion]; ok {
1✔
644
                                if os != nodeOS {
×
645
                                        return nil, fmt.Errorf("different OS versions found for the same kernel version %s, unsupported configuration", kernelVersion)
×
646
                                }
×
647
                        }
648
                        // add mapping for "kernelVersion" --> "OS"
649
                        kernelVersionMap[kernelVersion] = nodeOS
1✔
UNCOV
650
                } else {
×
UNCOV
651
                        err := apierrors.NewNotFound(schema.GroupResource{Group: "Node", Resource: "Label"}, nfdKernelLabelKey)
×
UNCOV
652
                        logger.Error(err, "Failed to get kernel version of GPU node using Node Feature Discovery (NFD) labels. Is NFD installed in the cluster?")
×
UNCOV
653
                        return nil, err
×
UNCOV
654
                }
×
655
        }
656

657
        return kernelVersionMap, nil
1✔
658
}
659

660
func kernelFullVersion(n ClusterPolicyController) (string, string, string) {
1✔
661
        ctx := n.ctx
1✔
662
        logger := n.logger.WithValues("Request.Namespace", "default", "Request.Name", "Node")
1✔
663
        // We need the node labels to fetch the correct container
1✔
664
        opts := []client.ListOption{
1✔
665
                client.MatchingLabels{"nvidia.com/gpu.present": "true"},
1✔
666
        }
1✔
667

1✔
668
        list := &corev1.NodeList{}
1✔
669
        err := n.client.List(ctx, list, opts...)
1✔
670
        if err != nil {
1✔
UNCOV
671
                logger.Info("Could not get NodeList", "ERROR", err)
×
UNCOV
672
                return "", "", ""
×
UNCOV
673
        }
×
674

675
        if len(list.Items) == 0 {
1✔
UNCOV
676
                // none of the nodes matched nvidia GPU label
×
UNCOV
677
                // either the nodes do not have GPUs, or NFD is not running
×
UNCOV
678
                logger.Info("Could not get any nodes to match nvidia.com/gpu.present label", "ERROR", "")
×
UNCOV
679
                return "", "", ""
×
UNCOV
680
        }
×
681

682
        // Assuming all nodes are running the same kernel version,
683
        // One could easily add driver-kernel-versions for each node.
684
        node := list.Items[0]
1✔
685
        labels := node.GetLabels()
1✔
686

1✔
687
        var ok bool
1✔
688
        kFVersion, ok := labels[nfdKernelLabelKey]
1✔
689
        if ok {
2✔
690
                logger.Info(kFVersion)
1✔
691
        } else {
1✔
UNCOV
692
                err := apierrors.NewNotFound(schema.GroupResource{Group: "Node", Resource: "Label"}, nfdKernelLabelKey)
×
UNCOV
693
                logger.Info("Couldn't get kernelVersion, did you run the node feature discovery?", "Error", err)
×
UNCOV
694
                return "", "", ""
×
UNCOV
695
        }
×
696

697
        osName, ok := labels[nfdOSReleaseIDLabelKey]
1✔
698
        if !ok {
1✔
UNCOV
699
                return kFVersion, "", ""
×
UNCOV
700
        }
×
701
        osVersion, ok := labels[nfdOSVersionIDLabelKey]
1✔
702
        if !ok {
1✔
703
                return kFVersion, "", ""
×
704
        }
×
705
        osTag := fmt.Sprintf("%s%s", osName, osVersion)
1✔
706

1✔
707
        return kFVersion, osTag, osVersion
1✔
708
}
709

710
func preprocessService(obj *corev1.Service, n ClusterPolicyController) error {
1✔
711
        logger := n.logger.WithValues("Service", obj.Name)
1✔
712
        transformations := map[string]func(*corev1.Service, *gpuv1.ClusterPolicySpec) error{
1✔
713
                "nvidia-dcgm-exporter": TransformDCGMExporterService,
1✔
714
        }
1✔
715

1✔
716
        t, ok := transformations[obj.Name]
1✔
717
        if !ok {
1✔
UNCOV
718
                logger.V(2).Info(fmt.Sprintf("No transformation for Service '%s'", obj.Name))
×
UNCOV
719
                return nil
×
UNCOV
720
        }
×
721

722
        err := t(obj, &n.singleton.Spec)
1✔
723
        if err != nil {
1✔
UNCOV
724
                logger.Error(err, "Failed to apply transformation", "Service", obj.Name)
×
UNCOV
725
                return err
×
UNCOV
726
        }
×
727

728
        return nil
1✔
729
}
730

731
// preProcessDaemonSet applies every transformation an operand DaemonSet needs
// before creation: a per-operand transform selected by DaemonSet name, common
// configuration shared by all operands (update strategy, priority class,
// tolerations), host-root / driver-install-dir volume rewrites, and
// user-supplied pod labels/annotations. DaemonSets with no registered
// transform pass through unchanged.
func preProcessDaemonSet(obj *appsv1.DaemonSet, n ClusterPolicyController) error {
	logger := n.logger.WithValues("Daemonset", obj.Name)
	// Registry mapping each operand DaemonSet name to its dedicated transform.
	transformations := map[string]func(*appsv1.DaemonSet, *gpuv1.ClusterPolicySpec, ClusterPolicyController) error{
		"nvidia-driver-daemonset":                 TransformDriver,
		"nvidia-vgpu-manager-daemonset":           TransformVGPUManager,
		"nvidia-vgpu-device-manager":              TransformVGPUDeviceManager,
		"nvidia-vfio-manager":                     TransformVFIOManager,
		"nvidia-container-toolkit-daemonset":      TransformToolkit,
		"nvidia-dra-driver-kubelet-plugin":        TransformDRADriverKubeletPlugin,
		"nvidia-device-plugin-daemonset":          TransformDevicePlugin,
		"nvidia-device-plugin-mps-control-daemon": TransformMPSControlDaemon,
		"nvidia-sandbox-device-plugin-daemonset":  TransformSandboxDevicePlugin,
		"nvidia-dcgm":                             TransformDCGM,
		"nvidia-dcgm-exporter":                    TransformDCGMExporter,
		"nvidia-node-status-exporter":             TransformNodeStatusExporter,
		"gpu-feature-discovery":                   TransformGPUDiscoveryPlugin,
		"nvidia-mig-manager":                      TransformMIGManager,
		"nvidia-operator-validator":               TransformValidator,
		"nvidia-sandbox-validator":                TransformSandboxValidator,
		"nvidia-kata-manager":                     TransformKataManager,
		"nvidia-cc-manager":                       TransformCCManager,
	}

	t, ok := transformations[obj.Name]
	if !ok {
		// Not an error: operands without custom handling are created as-is.
		logger.Info(fmt.Sprintf("No transformation for Daemonset '%s'", obj.Name))
		return nil
	}

	// apply common Daemonset configuration that is applicable to all
	err := applyCommonDaemonsetConfig(obj, &n.singleton.Spec)
	if err != nil {
		logger.Error(err, "Failed to apply common Daemonset transformation", "resource", obj.Name)
		return err
	}

	// transform the host-root and host-dev-char volumes if a custom host root is configured with the operator
	transformForHostRoot(obj, n.singleton.Spec.HostPaths.RootFS)

	// transform the driver-root volume if a custom driver install dir is configured with the operator
	transformForDriverInstallDir(obj, n.singleton.Spec.HostPaths.DriverInstallDir)

	// apply per operand Daemonset config
	err = t(obj, &n.singleton.Spec, n)
	if err != nil {
		logger.Error(err, "Failed to apply transformation", "resource", obj.Name)
		return err
	}

	// apply custom Labels and Annotations to the podSpec if any
	applyCommonDaemonsetMetadata(obj, &n.singleton.Spec.Daemonsets)

	return nil
}
785

786
// applyCommonDaemonsetMetadata adds additional labels and annotations to the daemonset podSpec if there are any specified
787
// by the user in the podSpec
788
func applyCommonDaemonsetMetadata(obj *appsv1.DaemonSet, dsSpec *gpuv1.DaemonsetsSpec) {
1✔
789
        if len(dsSpec.Labels) > 0 {
2✔
790
                if obj.Spec.Template.Labels == nil {
2✔
791
                        obj.Spec.Template.Labels = make(map[string]string)
1✔
792
                }
1✔
793
                for labelKey, labelValue := range dsSpec.Labels {
2✔
794
                        // if the user specifies an override of the "app" or the ""app.kubernetes.io/part-of"" key, we skip it.
1✔
795
                        // DaemonSet pod selectors are immutable, so we still want the pods to be selectable as before and working
1✔
796
                        // with the existing daemon set selectors.
1✔
797
                        if labelKey == "app" || labelKey == "app.kubernetes.io/part-of" {
2✔
798
                                continue
1✔
799
                        }
800
                        obj.Spec.Template.Labels[labelKey] = labelValue
1✔
801
                }
802
        }
803

804
        if len(dsSpec.Annotations) > 0 {
2✔
805
                if obj.Spec.Template.Annotations == nil {
2✔
806
                        obj.Spec.Template.Annotations = make(map[string]string)
1✔
807
                }
1✔
808
                for annoKey, annoVal := range dsSpec.Annotations {
2✔
809
                        obj.Spec.Template.Annotations[annoKey] = annoVal
1✔
810
                }
1✔
811
        }
812
}
813

814
// Apply common config that is applicable for all Daemonsets
815
func applyCommonDaemonsetConfig(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
1✔
816
        // apply daemonset update strategy
1✔
817
        err := applyUpdateStrategyConfig(obj, config)
1✔
818
        if err != nil {
2✔
819
                return err
1✔
820
        }
1✔
821

822
        // update PriorityClass
823
        if config.Daemonsets.PriorityClassName != "" {
2✔
824
                obj.Spec.Template.Spec.PriorityClassName = config.Daemonsets.PriorityClassName
1✔
825
        }
1✔
826

827
        // set tolerations if specified
828
        if len(config.Daemonsets.Tolerations) > 0 {
2✔
829
                obj.Spec.Template.Spec.Tolerations = config.Daemonsets.Tolerations
1✔
830
        }
1✔
831
        return nil
1✔
832
}
833

834
// apply necessary transforms if a custom host root path is configured
835
func transformForHostRoot(obj *appsv1.DaemonSet, hostRoot string) {
1✔
836
        if hostRoot == "" || hostRoot == "/" {
2✔
837
                return
1✔
838
        }
1✔
839

840
        transformHostRootVolume(obj, hostRoot)
1✔
841
        transformHostDevCharVolume(obj, hostRoot)
1✔
842
}
843

844
func transformHostRootVolume(obj *appsv1.DaemonSet, hostRoot string) {
1✔
845
        containsHostRootVolume := false
1✔
846
        for _, volume := range obj.Spec.Template.Spec.Volumes {
2✔
847
                if volume.Name == "host-root" {
2✔
848
                        volume.HostPath.Path = hostRoot
1✔
849
                        containsHostRootVolume = true
1✔
850
                        break
1✔
851
                }
852
        }
853

854
        if !containsHostRootVolume {
2✔
855
                return
1✔
856
        }
1✔
857

858
        for index := range obj.Spec.Template.Spec.InitContainers {
1✔
UNCOV
859
                setContainerEnv(&(obj.Spec.Template.Spec.InitContainers[index]), HostRootEnvName, hostRoot)
×
UNCOV
860
        }
×
861

862
        for index := range obj.Spec.Template.Spec.Containers {
2✔
863
                setContainerEnv(&(obj.Spec.Template.Spec.Containers[index]), HostRootEnvName, hostRoot)
1✔
864
        }
1✔
865
}
866

867
func transformHostDevCharVolume(obj *appsv1.DaemonSet, hostRoot string) {
1✔
868
        for _, volume := range obj.Spec.Template.Spec.Volumes {
2✔
869
                if volume.Name == "host-dev-char" {
2✔
870
                        volume.HostPath.Path = filepath.Join(hostRoot, "/dev/char")
1✔
871
                        break
1✔
872
                }
873
        }
874
}
875

876
// apply necessary transforms if a custom driver install directory is configured
877
func transformForDriverInstallDir(obj *appsv1.DaemonSet, driverInstallDir string) {
1✔
878
        if driverInstallDir == "" || driverInstallDir == DefaultDriverInstallDir {
2✔
879
                return
1✔
880
        }
1✔
881

882
        containsDriverInstallDirVolume := false
1✔
883
        podSpec := obj.Spec.Template.Spec
1✔
884
        for _, volume := range podSpec.Volumes {
2✔
885
                if volume.Name == "driver-install-dir" {
2✔
886
                        volume.HostPath.Path = driverInstallDir
1✔
887
                        containsDriverInstallDirVolume = true
1✔
888
                        break
1✔
889
                }
890
        }
891

892
        if !containsDriverInstallDirVolume {
2✔
893
                return
1✔
894
        }
1✔
895

896
        for i, ctr := range podSpec.InitContainers {
2✔
897
                if ctr.Name == "driver-validation" {
2✔
898
                        setContainerEnv(&(podSpec.InitContainers[i]), DriverInstallDirEnvName, driverInstallDir)
1✔
899
                        setContainerEnv(&(podSpec.InitContainers[i]), DriverInstallDirCtrPathEnvName, driverInstallDir)
1✔
900
                        for j, volumeMount := range ctr.VolumeMounts {
2✔
901
                                if volumeMount.Name == "driver-install-dir" {
2✔
902
                                        podSpec.InitContainers[i].VolumeMounts[j].MountPath = driverInstallDir
1✔
903
                                }
1✔
904
                        }
905
                }
906
        }
907
}
908

909
// TransformGPUDiscoveryPlugin transforms GPU discovery daemonset with required config as per ClusterPolicy.
// It updates, in order: the validation init container, the main container's
// image/pull policy/pull secrets, resource requests/limits, arguments and
// environment, the optional device-plugin configuration ConfigMap, the
// RuntimeClass, and the MIG-strategy environment variables.
// NOTE(review): Containers[0] is assumed to be the gpu-feature-discovery
// container throughout — confirm against the manifest ordering.
func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
	// update validation container
	err := transformValidationInitContainer(obj, config)
	if err != nil {
		return err
	}

	// update image
	img, err := gpuv1.ImagePath(&config.GPUFeatureDiscovery)
	if err != nil {
		return err
	}
	obj.Spec.Template.Spec.Containers[0].Image = img

	// update image pull policy
	obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.GPUFeatureDiscovery.ImagePullPolicy)

	// set image pull secrets
	if len(config.GPUFeatureDiscovery.ImagePullSecrets) > 0 {
		addPullSecrets(&obj.Spec.Template.Spec, config.GPUFeatureDiscovery.ImagePullSecrets)
	}

	// set resource limits
	if config.GPUFeatureDiscovery.Resources != nil {
		// apply resource limits to all containers
		for i := range obj.Spec.Template.Spec.Containers {
			obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.GPUFeatureDiscovery.Resources.Requests
			obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.GPUFeatureDiscovery.Resources.Limits
		}
	}

	// set arguments if specified for the GFD container
	if len(config.GPUFeatureDiscovery.Args) > 0 {
		obj.Spec.Template.Spec.Containers[0].Args = config.GPUFeatureDiscovery.Args
	}

	// set/append environment variables for the GFD container
	if len(config.GPUFeatureDiscovery.Env) > 0 {
		for _, env := range config.GPUFeatureDiscovery.Env {
			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
		}
	}

	// apply plugin configuration through ConfigMap if one is provided
	err = handleDevicePluginConfig(obj, config)
	if err != nil {
		return err
	}

	// set RuntimeClass for supported runtimes
	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)

	// update env required for MIG support
	applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)

	return nil
}
967

968
// parseOSRelease reads /host-etc/os-release (the host's os-release file
// bind-mounted into the container) and returns its KEY=VALUE pairs, with
// surrounding double quotes stripped from values. Under unit tests
// (UNIT_TEST=true) it short-circuits and returns an empty map.
func parseOSRelease() (map[string]string, error) {
	release := map[string]string{}

	// TODO: mock this call instead
	if os.Getenv("UNIT_TEST") == "true" {
		return release, nil
	}

	f, err := os.Open("/host-etc/os-release")
	if err != nil {
		return nil, err
	}
	// BUGFIX: the file handle was previously never closed (descriptor leak).
	defer f.Close()

	re := regexp.MustCompile(`^(?P<key>\w+)=(?P<value>.+)`)

	// Read line-by-line
	s := bufio.NewScanner(f)
	for s.Scan() {
		line := s.Text()
		if m := re.FindStringSubmatch(line); m != nil {
			release[m[1]] = strings.Trim(m[2], `"`)
		}
	}
	// BUGFIX: surface scanner I/O errors instead of silently returning a
	// partial map.
	if err := s.Err(); err != nil {
		return nil, err
	}
	return release, nil
}
994

995
func TransformDCGMExporterService(obj *corev1.Service, config *gpuv1.ClusterPolicySpec) error {
1✔
996
        serviceConfig := config.DCGMExporter.ServiceSpec
1✔
997
        if serviceConfig != nil {
1✔
UNCOV
998
                if len(serviceConfig.Type) > 0 {
×
UNCOV
999
                        obj.Spec.Type = serviceConfig.Type
×
1000
                }
×
1001

1002
                if serviceConfig.InternalTrafficPolicy != nil {
×
UNCOV
1003
                        obj.Spec.InternalTrafficPolicy = serviceConfig.InternalTrafficPolicy
×
UNCOV
1004
                }
×
1005
        }
1006
        return nil
1✔
1007
}
1008

1009
// TransformDriver transforms Nvidia driver daemonset with required config as per ClusterPolicy
1010
func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
1011
        // update validation container
1✔
1012
        err := transformValidationInitContainer(obj, config)
1✔
1013
        if err != nil {
1✔
1014
                return err
×
1015
        }
×
1016

1017
        // update driver-manager initContainer
1018
        err = transformDriverManagerInitContainer(obj, &config.Driver.Manager, config.Driver.GPUDirectRDMA)
1✔
1019
        if err != nil {
1✔
1020
                return err
×
1021
        }
×
1022

1023
        // update nvidia-driver container
1024
        err = transformDriverContainer(obj, config, n)
1✔
1025
        if err != nil {
1✔
UNCOV
1026
                return err
×
UNCOV
1027
        }
×
1028

1029
        // update nvidia-peermem sidecar container
1030
        err = transformPeerMemoryContainer(obj, config, n)
1✔
1031
        if err != nil {
1✔
UNCOV
1032
                return err
×
UNCOV
1033
        }
×
1034

1035
        // update nvidia-fs sidecar container
1036
        err = transformGDSContainer(obj, config, n)
1✔
1037
        if err != nil {
1✔
1038
                return err
×
UNCOV
1039
        }
×
1040

1041
        // updated nvidia-gdrcopy sidecar container
1042
        err = transformGDRCopyContainer(obj, config, n)
1✔
1043
        if err != nil {
1✔
1044
                return err
×
1045
        }
×
1046

1047
        // update/remove OpenShift Driver Toolkit sidecar container
1048
        err = transformOpenShiftDriverToolkitContainer(obj, config, n, "nvidia-driver-ctr")
1✔
1049
        if err != nil {
1✔
1050
                return fmt.Errorf("ERROR: failed to transform the Driver Toolkit Container: %s", err)
×
UNCOV
1051
        }
×
1052

1053
        // updates for per kernel version pods using pre-compiled drivers
1054
        if config.Driver.UsePrecompiledDrivers() {
2✔
1055
                err = transformPrecompiledDriverDaemonset(obj, n)
1✔
1056
                if err != nil {
1✔
1057
                        return fmt.Errorf("ERROR: failed to transform the pre-compiled Driver Daemonset: %s", err)
×
1058
                }
×
1059
        }
1060
        return nil
1✔
1061
}
1062

1063
// TransformVGPUManager transforms NVIDIA vGPU Manager daemonset with required config as per ClusterPolicy
1064
func TransformVGPUManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
1065
        // update k8s-driver-manager initContainer
1✔
1066
        err := transformDriverManagerInitContainer(obj, &config.VGPUManager.DriverManager, nil)
1✔
1067
        if err != nil {
1✔
UNCOV
1068
                return fmt.Errorf("failed to transform k8s-driver-manager initContainer for vGPU Manager: %v", err)
×
UNCOV
1069
        }
×
1070

1071
        // update nvidia-vgpu-manager container
1072
        err = transformVGPUManagerContainer(obj, config, n)
1✔
1073
        if err != nil {
1✔
UNCOV
1074
                return fmt.Errorf("failed to transform vGPU Manager container: %v", err)
×
UNCOV
1075
        }
×
1076

1077
        // update OpenShift Driver Toolkit sidecar container
1078
        err = transformOpenShiftDriverToolkitContainer(obj, config, n, "nvidia-vgpu-manager-ctr")
1✔
1079
        if err != nil {
1✔
UNCOV
1080
                return fmt.Errorf("failed to transform the Driver Toolkit container: %s", err)
×
UNCOV
1081
        }
×
1082

1083
        return nil
1✔
1084
}
1085

1086
// applyOCPProxySpec applies OpenShift cluster-wide proxy settings to podSpec:
// it injects HTTPS_PROXY/HTTP_PROXY/NO_PROXY env vars into any container
// whose name contains "nvidia-driver", and, when a user CA bundle is
// configured on the proxy, mounts a trusted-ca ConfigMap into that container.
// A nil proxy (none configured) is not an error.
func applyOCPProxySpec(n ClusterPolicyController, podSpec *corev1.PodSpec) error {
	// Pass HTTPS_PROXY, HTTP_PROXY and NO_PROXY env if set in clusterwide proxy for OCP
	proxy, err := GetClusterWideProxy(n.ctx)
	if err != nil {
		return fmt.Errorf("ERROR: failed to get clusterwide proxy object: %s", err)
	}

	if proxy == nil {
		// no clusterwide proxy configured
		return nil
	}

	for i, container := range podSpec.Containers {
		// skip if not nvidia-driver container
		if !strings.Contains(container.Name, "nvidia-driver") {
			continue
		}

		proxyEnv := getProxyEnv(proxy)
		if len(proxyEnv) != 0 {
			podSpec.Containers[i].Env = append(podSpec.Containers[i].Env, proxyEnv...)
		}

		// if user-ca-bundle is setup in proxy, create a trusted-ca configmap and add volume mount
		// NOTE(review): this `return nil` ends processing at the first
		// nvidia-driver container when no TrustedCA is configured — presumably
		// there is only one such container per pod; confirm against manifests.
		if proxy.Spec.TrustedCA.Name == "" {
			return nil
		}

		// create trusted-ca configmap to inject custom user ca bundle into it
		_, err = getOrCreateTrustedCAConfigMap(n, TrustedCAConfigMapName)
		if err != nil {
			return err
		}

		// mount trusted-ca configmap
		podSpec.Containers[i].VolumeMounts = append(podSpec.Containers[i].VolumeMounts,
			corev1.VolumeMount{
				Name:      TrustedCAConfigMapName,
				ReadOnly:  true,
				MountPath: TrustedCABundleMountDir,
			})
		// Pod-level volume backing the mount above; the ConfigMap key is
		// projected to the expected certificate file name.
		podSpec.Volumes = append(podSpec.Volumes,
			corev1.Volume{
				Name: TrustedCAConfigMapName,
				VolumeSource: corev1.VolumeSource{
					ConfigMap: &corev1.ConfigMapVolumeSource{
						LocalObjectReference: corev1.LocalObjectReference{
							Name: TrustedCAConfigMapName,
						},
						Items: []corev1.KeyToPath{
							{
								Key:  TrustedCABundleFileName,
								Path: TrustedCACertificate,
							},
						},
					},
				},
			})
	}
	return nil
}
1148

1149
// getOrCreateTrustedCAConfigMap creates or returns an existing Trusted CA Bundle ConfigMap.
1150
func getOrCreateTrustedCAConfigMap(n ClusterPolicyController, name string) (*corev1.ConfigMap, error) {
×
UNCOV
1151
        ctx := n.ctx
×
1152
        configMap := &corev1.ConfigMap{
×
1153
                TypeMeta: metav1.TypeMeta{
×
1154
                        Kind:       "ConfigMap",
×
1155
                        APIVersion: corev1.SchemeGroupVersion.String(),
×
UNCOV
1156
                },
×
1157
                ObjectMeta: metav1.ObjectMeta{
×
1158
                        Name:      name,
×
1159
                        Namespace: n.operatorNamespace,
×
1160
                },
×
UNCOV
1161
                Data: map[string]string{
×
UNCOV
1162
                        TrustedCABundleFileName: "",
×
1163
                },
×
1164
        }
×
1165

×
1166
        // apply label "config.openshift.io/inject-trusted-cabundle: true", so that cert is automatically filled/updated.
×
UNCOV
1167
        configMap.Labels = make(map[string]string)
×
UNCOV
1168
        configMap.Labels["config.openshift.io/inject-trusted-cabundle"] = "true"
×
1169

×
1170
        logger := n.logger.WithValues("ConfigMap", configMap.Name, "Namespace", configMap.Namespace)
×
1171

×
UNCOV
1172
        if err := controllerutil.SetControllerReference(n.singleton, configMap, n.scheme); err != nil {
×
UNCOV
1173
                return nil, err
×
1174
        }
×
1175

1176
        found := &corev1.ConfigMap{}
×
1177
        err := n.client.Get(ctx, types.NamespacedName{Namespace: configMap.Namespace, Name: configMap.Name}, found)
×
UNCOV
1178
        if err != nil && apierrors.IsNotFound(err) {
×
UNCOV
1179
                logger.Info("Not found, creating")
×
1180
                err = n.client.Create(ctx, configMap)
×
1181
                if err != nil {
×
1182
                        logger.Info("Couldn't create")
×
1183
                        return nil, fmt.Errorf("failed to create trusted CA bundle config map %q: %s", name, err)
×
1184
                }
×
1185
                return configMap, nil
×
1186
        } else if err != nil {
×
1187
                return nil, fmt.Errorf("failed to get trusted CA bundle config map %q: %s", name, err)
×
1188
        }
×
1189

1190
        return found, nil
×
1191
}
1192

1193
// get proxy env variables from cluster wide proxy in OCP
1194
func getProxyEnv(proxyConfig *apiconfigv1.Proxy) []corev1.EnvVar {
×
1195
        envVars := []corev1.EnvVar{}
×
1196
        if proxyConfig == nil {
×
1197
                return envVars
×
1198
        }
×
1199
        proxies := map[string]string{
×
1200
                "HTTPS_PROXY": proxyConfig.Spec.HTTPSProxy,
×
1201
                "HTTP_PROXY":  proxyConfig.Spec.HTTPProxy,
×
1202
                "NO_PROXY":    proxyConfig.Spec.NoProxy,
×
UNCOV
1203
        }
×
1204
        var envs []string
×
UNCOV
1205
        for k := range proxies {
×
UNCOV
1206
                envs = append(envs, k)
×
UNCOV
1207
        }
×
1208
        // ensure ordering is preserved when we add these env to pod spec
1209
        sort.Strings(envs)
×
1210

×
1211
        for _, e := range envs {
×
1212
                v := proxies[e]
×
1213
                if len(v) == 0 {
×
1214
                        continue
×
1215
                }
1216
                upperCaseEnvvar := corev1.EnvVar{
×
1217
                        Name:  strings.ToUpper(e),
×
1218
                        Value: v,
×
1219
                }
×
1220
                lowerCaseEnvvar := corev1.EnvVar{
×
1221
                        Name:  strings.ToLower(e),
×
1222
                        Value: v,
×
1223
                }
×
1224
                envVars = append(envVars, upperCaseEnvvar, lowerCaseEnvvar)
×
1225
        }
1226

1227
        return envVars
×
1228
}
1229

1230
// TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy.
// It updates the validation init container, the toolkit image/pull policy/secrets,
// resource limits, CDI-related env vars, the toolkit install directory (env, volume
// and mount), runtime-specific config (via transformForRuntime), the CRI-O hooks
// host path for non-OpenShift clusters, and finally any user-provided env vars.
func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
	// update validation container
	err := transformValidationInitContainer(obj, config)
	if err != nil {
		return err
	}
	// update image
	image, err := gpuv1.ImagePath(&config.Toolkit)
	if err != nil {
		return err
	}
	obj.Spec.Template.Spec.Containers[0].Image = image

	// update image pull policy
	obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.Toolkit.ImagePullPolicy)

	// set image pull secrets
	if len(config.Toolkit.ImagePullSecrets) > 0 {
		addPullSecrets(&obj.Spec.Template.Spec, config.Toolkit.ImagePullSecrets)
	}

	// set resource limits
	if config.Toolkit.Resources != nil {
		// apply resource limits to all containers
		for i := range obj.Spec.Template.Spec.Containers {
			obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.Toolkit.Resources.Requests
			obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.Toolkit.Resources.Limits
		}
	}

	// update env required for CDI support
	if config.CDI.IsEnabled() {
		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeCDIPrefixesEnvName, "nvidia.cdi.k8s.io/")
		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CrioConfigModeEnvName, "config")
		if config.CDI.IsDefault() {
			// make CDI the default runtime mode only when explicitly requested
			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi")
		}
	}

	// set install directory for the toolkit; only rewire the env/volume/mount
	// when a non-default directory is configured
	if config.Toolkit.InstallDir != "" && config.Toolkit.InstallDir != DefaultToolkitInstallDir {
		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), ToolkitInstallDirEnvName, config.Toolkit.InstallDir)

		// repoint the host-path volume backing the install dir
		for i, volume := range obj.Spec.Template.Spec.Volumes {
			if volume.Name == "toolkit-install-dir" {
				obj.Spec.Template.Spec.Volumes[i].HostPath.Path = config.Toolkit.InstallDir
				break
			}
		}

		// keep the container mount path in sync with the volume's host path
		for i, volumeMount := range obj.Spec.Template.Spec.Containers[0].VolumeMounts {
			if volumeMount.Name == "toolkit-install-dir" {
				obj.Spec.Template.Spec.Containers[0].VolumeMounts[i].MountPath = config.Toolkit.InstallDir
				break
			}
		}
	}

	// configure runtime
	runtime := n.runtime.String()
	err = transformForRuntime(obj, config, runtime, "nvidia-container-toolkit-ctr")
	if err != nil {
		return fmt.Errorf("error transforming toolkit daemonset : %w", err)
	}

	// Update CRI-O hooks path to use default path for non OCP cases
	if n.openshift == "" && n.runtime == gpuv1.CRIO {
		for index, volume := range obj.Spec.Template.Spec.Volumes {
			if volume.Name == "crio-hooks" {
				obj.Spec.Template.Spec.Volumes[index].HostPath.Path = "/usr/share/containers/oci/hooks.d"
			}
		}
	}

	// apply user-provided env vars last so they take effect over defaults set above
	if len(config.Toolkit.Env) > 0 {
		for _, env := range config.Toolkit.Env {
			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
		}
	}

	return nil
}
1314

1315
func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, runtime string, containerName string) error {
1✔
1316
        var mainContainer *corev1.Container
1✔
1317
        for i, ctr := range obj.Spec.Template.Spec.Containers {
2✔
1318
                if ctr.Name == containerName {
2✔
1319
                        mainContainer = &obj.Spec.Template.Spec.Containers[i]
1✔
1320
                        break
1✔
1321
                }
1322
        }
1323
        if mainContainer == nil {
1✔
1324
                return fmt.Errorf("failed to find main container %q", containerName)
×
1325
        }
×
1326

1327
        setContainerEnv(mainContainer, "RUNTIME", runtime)
1✔
1328

1✔
1329
        if runtime == gpuv1.Containerd.String() {
2✔
1330
                // Set the runtime class name that is to be configured for containerd
1✔
1331
                setContainerEnv(mainContainer, "CONTAINERD_RUNTIME_CLASS", getRuntimeClass(config))
1✔
1332
        }
1✔
1333

1334
        // setup mounts for runtime config file
1335
        runtimeConfigFile, err := getRuntimeConfigFile(mainContainer, runtime)
1✔
1336
        if err != nil {
1✔
UNCOV
1337
                return fmt.Errorf("error getting path to runtime config file: %v", err)
×
UNCOV
1338
        }
×
1339
        sourceConfigFileName := path.Base(runtimeConfigFile)
1✔
1340

1✔
1341
        var configEnvvarName string
1✔
1342
        switch runtime {
1✔
1343
        case gpuv1.Containerd.String():
1✔
1344
                configEnvvarName = "CONTAINERD_CONFIG"
1✔
UNCOV
1345
        case gpuv1.Docker.String():
×
UNCOV
1346
                configEnvvarName = "DOCKER_CONFIG"
×
1347
        case gpuv1.CRIO.String():
1✔
1348
                configEnvvarName = "CRIO_CONFIG"
1✔
1349
        }
1350

1351
        setContainerEnv(mainContainer, "RUNTIME_CONFIG", DefaultRuntimeConfigTargetDir+sourceConfigFileName)
1✔
1352
        setContainerEnv(mainContainer, configEnvvarName, DefaultRuntimeConfigTargetDir+sourceConfigFileName)
1✔
1353

1✔
1354
        volMountConfigName := fmt.Sprintf("%s-config", runtime)
1✔
1355
        volMountConfig := corev1.VolumeMount{Name: volMountConfigName, MountPath: DefaultRuntimeConfigTargetDir}
1✔
1356
        mainContainer.VolumeMounts = append(mainContainer.VolumeMounts, volMountConfig)
1✔
1357

1✔
1358
        configVol := corev1.Volume{Name: volMountConfigName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(runtimeConfigFile), Type: newHostPathType(corev1.HostPathDirectoryOrCreate)}}}
1✔
1359
        obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, configVol)
1✔
1360

1✔
1361
        // setup mounts for runtime socket file
1✔
1362
        runtimeSocketFile, err := getRuntimeSocketFile(mainContainer, runtime)
1✔
1363
        if err != nil {
1✔
UNCOV
1364
                return fmt.Errorf("error getting path to runtime socket: %w", err)
×
UNCOV
1365
        }
×
1366
        if runtimeSocketFile != "" {
2✔
1367
                sourceSocketFileName := path.Base(runtimeSocketFile)
1✔
1368
                // set envvar for runtime socket
1✔
1369
                var socketEnvvarName string
1✔
1370
                if runtime == gpuv1.Containerd.String() {
2✔
1371
                        socketEnvvarName = "CONTAINERD_SOCKET"
1✔
1372
                } else if runtime == gpuv1.Docker.String() {
1✔
UNCOV
1373
                        socketEnvvarName = "DOCKER_SOCKET"
×
UNCOV
1374
                }
×
1375
                setContainerEnv(mainContainer, "RUNTIME_SOCKET", DefaultRuntimeSocketTargetDir+sourceSocketFileName)
1✔
1376
                setContainerEnv(mainContainer, socketEnvvarName, DefaultRuntimeSocketTargetDir+sourceSocketFileName)
1✔
1377

1✔
1378
                volMountSocketName := fmt.Sprintf("%s-socket", runtime)
1✔
1379
                volMountSocket := corev1.VolumeMount{Name: volMountSocketName, MountPath: DefaultRuntimeSocketTargetDir}
1✔
1380
                mainContainer.VolumeMounts = append(mainContainer.VolumeMounts, volMountSocket)
1✔
1381

1✔
1382
                socketVol := corev1.Volume{Name: volMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(runtimeSocketFile)}}}
1✔
1383
                obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, socketVol)
1✔
1384
        }
1385
        return nil
1✔
1386
}
1387

1388
// TransformDevicePlugin transforms k8s-device-plugin daemonset with required config as per ClusterPolicy.
// It updates the validation init container, the plugin image/pull policy/secrets,
// resource limits and args, GDS/MIG/CDI-related env vars, the plugin ConfigMap,
// the RuntimeClass, MPS volumes for a custom MPS root, and finally any
// user-provided env vars.
func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
	// update validation container
	err := transformValidationInitContainer(obj, config)
	if err != nil {
		return err
	}

	// update image
	image, err := gpuv1.ImagePath(&config.DevicePlugin)
	if err != nil {
		return err
	}
	obj.Spec.Template.Spec.Containers[0].Image = image

	// update image pull policy
	obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy)

	// set image pull secrets
	if len(config.DevicePlugin.ImagePullSecrets) > 0 {
		addPullSecrets(&obj.Spec.Template.Spec, config.DevicePlugin.ImagePullSecrets)
	}

	// set resource limits
	if config.DevicePlugin.Resources != nil {
		// apply resource limits to all containers
		for i := range obj.Spec.Template.Spec.Containers {
			obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DevicePlugin.Resources.Requests
			obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DevicePlugin.Resources.Limits
		}
	}
	// set arguments if specified for device-plugin container
	if len(config.DevicePlugin.Args) > 0 {
		obj.Spec.Template.Spec.Containers[0].Args = config.DevicePlugin.Args
	}

	// add env to allow injection of /dev/nvidia-fs and /dev/infiniband devices for GDS
	if config.GPUDirectStorage != nil && config.GPUDirectStorage.IsEnabled() {
		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), GDSEnabledEnvName, "true")
		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), MOFEDEnabledEnvName, "true")
	}

	// apply plugin configuration through ConfigMap if one is provided
	err = handleDevicePluginConfig(obj, config)
	if err != nil {
		return err
	}

	// set RuntimeClass for supported runtimes
	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)

	// update env required for MIG support
	applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)

	// update env required for CDI support
	if config.CDI.IsEnabled() {
		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, "envvar,cdi-annotations")
		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, "nvidia.cdi.k8s.io/")
		if config.Toolkit.IsEnabled() {
			// point the plugin at the CDI hook binary shipped inside the toolkit install dir
			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook"))
		}
	}

	// update MPS volumes and set MPS_ROOT env var if a custom MPS root is configured
	if config.DevicePlugin.MPS != nil && config.DevicePlugin.MPS.Root != "" &&
		config.DevicePlugin.MPS.Root != DefaultMPSRoot {
		for i, volume := range obj.Spec.Template.Spec.Volumes {
			switch volume.Name {
			case "mps-root":
				obj.Spec.Template.Spec.Volumes[i].HostPath.Path = config.DevicePlugin.MPS.Root
			case "mps-shm":
				// the shm directory is expected under the MPS root on the host
				obj.Spec.Template.Spec.Volumes[i].HostPath.Path = filepath.Join(config.DevicePlugin.MPS.Root, "shm")
			}
		}
		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), MPSRootEnvName, config.DevicePlugin.MPS.Root)
	}

	// apply user-provided env vars last so they take effect over defaults set above
	if len(config.DevicePlugin.Env) > 0 {
		for _, env := range config.DevicePlugin.Env {
			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
		}
	}

	return nil
}
1474

1475
func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
×
UNCOV
1476
        // update validation container
×
UNCOV
1477
        err := transformValidationInitContainer(obj, config)
×
UNCOV
1478
        if err != nil {
×
UNCOV
1479
                return err
×
UNCOV
1480
        }
×
1481

UNCOV
1482
        image, err := gpuv1.ImagePath(&config.DevicePlugin)
×
UNCOV
1483
        if err != nil {
×
1484
                return err
×
1485
        }
×
1486
        imagePullPolicy := gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy)
×
UNCOV
1487

×
UNCOV
1488
        // update image path and imagePullPolicy for 'mps-control-daemon-mounts' initContainer
×
UNCOV
1489
        for i, initCtr := range obj.Spec.Template.Spec.InitContainers {
×
UNCOV
1490
                if initCtr.Name == "mps-control-daemon-mounts" {
×
1491
                        obj.Spec.Template.Spec.InitContainers[i].Image = image
×
1492
                        obj.Spec.Template.Spec.InitContainers[i].ImagePullPolicy = imagePullPolicy
×
UNCOV
1493
                        break
×
1494
                }
1495
        }
1496

1497
        // update image path and imagePullPolicy for main container
UNCOV
1498
        var mainContainer *corev1.Container
×
UNCOV
1499
        for i, ctr := range obj.Spec.Template.Spec.Containers {
×
UNCOV
1500
                if ctr.Name == "mps-control-daemon-ctr" {
×
UNCOV
1501
                        mainContainer = &obj.Spec.Template.Spec.Containers[i]
×
1502
                        break
×
1503
                }
1504
        }
1505
        if mainContainer == nil {
×
1506
                return fmt.Errorf("failed to find main container 'mps-control-daemon-ctr'")
×
1507
        }
×
UNCOV
1508
        mainContainer.Image = image
×
UNCOV
1509
        mainContainer.ImagePullPolicy = imagePullPolicy
×
UNCOV
1510

×
UNCOV
1511
        // set image pull secrets
×
UNCOV
1512
        if len(config.DevicePlugin.ImagePullSecrets) > 0 {
×
1513
                addPullSecrets(&obj.Spec.Template.Spec, config.DevicePlugin.ImagePullSecrets)
×
1514
        }
×
1515

1516
        // set resource limits
1517
        if config.DevicePlugin.Resources != nil {
×
1518
                // apply resource limits to all containers
×
UNCOV
1519
                for i := range obj.Spec.Template.Spec.Containers {
×
UNCOV
1520
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DevicePlugin.Resources.Requests
×
1521
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DevicePlugin.Resources.Limits
×
UNCOV
1522
                }
×
1523
        }
1524

1525
        // apply plugin configuration through ConfigMap if one is provided
UNCOV
1526
        err = handleDevicePluginConfig(obj, config)
×
UNCOV
1527
        if err != nil {
×
UNCOV
1528
                return err
×
UNCOV
1529
        }
×
1530

1531
        // set RuntimeClass for supported runtimes
UNCOV
1532
        setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
×
1533

×
1534
        // update env required for MIG support
×
1535
        applyMIGConfiguration(mainContainer, config.MIG.Strategy)
×
1536

×
1537
        // update MPS volumes if a custom MPS root is configured
×
1538
        if config.DevicePlugin.MPS != nil && config.DevicePlugin.MPS.Root != "" &&
×
UNCOV
1539
                config.DevicePlugin.MPS.Root != DefaultMPSRoot {
×
1540
                for i, volume := range obj.Spec.Template.Spec.Volumes {
×
1541
                        switch volume.Name {
×
1542
                        case "mps-root":
×
1543
                                obj.Spec.Template.Spec.Volumes[i].HostPath.Path = config.DevicePlugin.MPS.Root
×
1544
                        case "mps-shm":
×
1545
                                obj.Spec.Template.Spec.Volumes[i].HostPath.Path = filepath.Join(config.DevicePlugin.MPS.Root, "shm")
×
1546
                        }
1547
                }
1548
        }
1549

1550
        return nil
×
1551
}
1552

1553
// TransformSandboxDevicePlugin transforms sandbox-device-plugin daemonset with required config as per ClusterPolicy
1554
func TransformSandboxDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
1555
        // update validation container
1✔
1556
        err := transformValidationInitContainer(obj, config)
1✔
1557
        if err != nil {
1✔
1558
                return err
×
1559
        }
×
1560
        // update image
1561
        image, err := gpuv1.ImagePath(&config.SandboxDevicePlugin)
1✔
1562
        if err != nil {
1✔
1563
                return err
×
1564
        }
×
1565
        obj.Spec.Template.Spec.Containers[0].Image = image
1✔
1566

1✔
1567
        // update image pull policy
1✔
1568
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.SandboxDevicePlugin.ImagePullPolicy)
1✔
1569
        // set image pull secrets
1✔
1570
        if len(config.SandboxDevicePlugin.ImagePullSecrets) > 0 {
2✔
1571
                addPullSecrets(&obj.Spec.Template.Spec, config.SandboxDevicePlugin.ImagePullSecrets)
1✔
1572
        }
1✔
1573
        // set resource limits
1574
        if config.SandboxDevicePlugin.Resources != nil {
1✔
1575
                // apply resource limits to all containers
×
1576
                for i := range obj.Spec.Template.Spec.Containers {
×
1577
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.SandboxDevicePlugin.Resources.Requests
×
1578
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.SandboxDevicePlugin.Resources.Limits
×
1579
                }
×
1580
        }
1581
        // set arguments if specified for device-plugin container
1582
        if len(config.SandboxDevicePlugin.Args) > 0 {
1✔
UNCOV
1583
                obj.Spec.Template.Spec.Containers[0].Args = config.SandboxDevicePlugin.Args
×
1584
        }
×
1585
        // set/append environment variables for device-plugin container
1586
        if len(config.SandboxDevicePlugin.Env) > 0 {
1✔
1587
                for _, env := range config.SandboxDevicePlugin.Env {
×
UNCOV
1588
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
×
UNCOV
1589
                }
×
1590
        }
1591
        return nil
1✔
1592
}
1593

1594
// TransformDRADriverKubeletPlugin transforms nvidia-dra-driver-kubelet-plugin daemonset with required config as per ClusterPolicy
1595
func TransformDRADriverKubeletPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
1596
        err := transformValidationInitContainer(obj, config)
1✔
1597
        if err != nil {
1✔
NEW
1598
                return err
×
NEW
1599
        }
×
1600

1601
        if len(config.DRADriver.ImagePullSecrets) > 0 {
1✔
NEW
1602
                addPullSecrets(&obj.Spec.Template.Spec, config.DRADriver.ImagePullSecrets)
×
NEW
1603
        }
×
1604

1605
        image, err := gpuv1.ImagePath(&config.DRADriver)
1✔
1606
        if err != nil {
2✔
1607
                return err
1✔
1608
        }
1✔
1609

1610
        var containers []corev1.Container
1✔
1611
        for i, container := range obj.Spec.Template.Spec.Containers {
2✔
1612
                // Skip the container if the resource type is not enabled.
1✔
1613
                // As a result, the container will be removed from the spec.
1✔
1614
                if (container.Name == "gpus" && !config.DRADriver.IsGPUsEnabled()) ||
1✔
1615
                        (container.Name == "compute-domains" && !config.DRADriver.IsComputeDomainsEnabled()) {
2✔
1616
                        continue
1✔
1617
                }
1618

1619
                obj.Spec.Template.Spec.Containers[i].Image = image
1✔
1620
                obj.Spec.Template.Spec.Containers[i].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DRADriver.ImagePullPolicy)
1✔
1621

1✔
1622
                if config.Toolkit.IsEnabled() {
2✔
1623
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), NvidiaCTKPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-ctk"))
1✔
1624
                }
1✔
1625

1626
                // update the "gpus" container
1627
                if container.Name == "gpus" {
2✔
1628
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), "IMAGE_NAME", image)
1✔
1629
                        if len(config.DRADriver.GPUs.KubeletPlugin.Env) > 0 {
2✔
1630
                                for _, env := range config.DRADriver.GPUs.KubeletPlugin.Env {
2✔
1631
                                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), env.Name, env.Value)
1✔
1632
                                }
1✔
1633
                        }
1634

1635
                        if config.DRADriver.GPUs.KubeletPlugin.Resources != nil {
2✔
1636
                                obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DRADriver.GPUs.KubeletPlugin.Resources.Requests
1✔
1637
                                obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DRADriver.GPUs.KubeletPlugin.Resources.Limits
1✔
1638
                        }
1✔
1639
                }
1640

1641
                // update the "compute-domains" container
1642
                if container.Name == "compute-domains" {
2✔
1643
                        if len(config.DRADriver.ComputeDomains.KubeletPlugin.Env) > 0 {
2✔
1644
                                for _, env := range config.DRADriver.ComputeDomains.KubeletPlugin.Env {
2✔
1645
                                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), env.Name, env.Value)
1✔
1646
                                }
1✔
1647
                        }
1648

1649
                        if config.DRADriver.ComputeDomains.KubeletPlugin.Resources != nil {
2✔
1650
                                obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DRADriver.ComputeDomains.KubeletPlugin.Resources.Requests
1✔
1651
                                obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DRADriver.ComputeDomains.KubeletPlugin.Resources.Limits
1✔
1652
                        }
1✔
1653
                }
1654

1655
                containers = append(containers, obj.Spec.Template.Spec.Containers[i])
1✔
1656
        }
1657
        obj.Spec.Template.Spec.Containers = containers
1✔
1658

1✔
1659
        return nil
1✔
1660
}
1661

1662
// TransformDCGMExporter transforms dcgm exporter daemonset with required config as per ClusterPolicy
1663
func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
1664
        // update validation container
1✔
1665
        err := transformValidationInitContainer(obj, config)
1✔
1666
        if err != nil {
1✔
1667
                return err
×
1668
        }
×
1669

1670
        // update image
1671
        image, err := gpuv1.ImagePath(&config.DCGMExporter)
1✔
1672
        if err != nil {
1✔
1673
                return err
×
1674
        }
×
1675
        obj.Spec.Template.Spec.Containers[0].Image = image
1✔
1676

1✔
1677
        // update image pull policy
1✔
1678
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DCGMExporter.ImagePullPolicy)
1✔
1679
        // set image pull secrets
1✔
1680
        if len(config.DCGMExporter.ImagePullSecrets) > 0 {
2✔
1681
                addPullSecrets(&obj.Spec.Template.Spec, config.DCGMExporter.ImagePullSecrets)
1✔
1682
        }
1✔
1683
        // set resource limits
1684
        if config.DCGMExporter.Resources != nil {
1✔
1685
                // apply resource limits to all containers
×
1686
                for i := range obj.Spec.Template.Spec.Containers {
×
1687
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DCGMExporter.Resources.Requests
×
1688
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DCGMExporter.Resources.Limits
×
1689
                }
×
1690
        }
1691
        // set arguments if specified for exporter container
1692
        if len(config.DCGMExporter.Args) > 0 {
2✔
1693
                obj.Spec.Template.Spec.Containers[0].Args = config.DCGMExporter.Args
1✔
1694
        }
1✔
1695

1696
        // check if DCGM hostengine is enabled as a separate Pod and setup env accordingly
1697
        if config.DCGM.IsEnabled() {
2✔
1698
                setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName, fmt.Sprintf("nvidia-dcgm:%d", DCGMDefaultPort))
1✔
1699
        } else {
1✔
1700
                // case for DCGM running on the host itself(DGX BaseOS)
×
1701
                remoteEngine := getContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName)
×
1702
                if remoteEngine != "" && strings.HasPrefix(remoteEngine, "localhost") {
×
1703
                        // enable hostNetwork for communication with external DCGM using localhost
×
1704
                        obj.Spec.Template.Spec.HostNetwork = true
×
1705
                }
×
1706
        }
1707

1708
        // set RuntimeClass for supported runtimes
1709
        setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
1✔
1710

1✔
1711
        // mount configmap for custom metrics if provided by user
1✔
1712
        if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" {
1✔
1713
                metricsConfigVolMount := corev1.VolumeMount{Name: "metrics-config", ReadOnly: true, MountPath: MetricsConfigMountPath, SubPath: MetricsConfigFileName}
×
1714
                obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, metricsConfigVolMount)
×
1715

×
1716
                metricsConfigVolumeSource := corev1.VolumeSource{
×
1717
                        ConfigMap: &corev1.ConfigMapVolumeSource{
×
1718
                                LocalObjectReference: corev1.LocalObjectReference{
×
1719
                                        Name: config.DCGMExporter.MetricsConfig.Name,
×
UNCOV
1720
                                },
×
UNCOV
1721
                                Items: []corev1.KeyToPath{
×
UNCOV
1722
                                        {
×
UNCOV
1723
                                                Key:  MetricsConfigFileName,
×
UNCOV
1724
                                                Path: MetricsConfigFileName,
×
1725
                                        },
×
1726
                                },
×
UNCOV
1727
                        },
×
UNCOV
1728
                }
×
UNCOV
1729
                metricsConfigVol := corev1.Volume{Name: "metrics-config", VolumeSource: metricsConfigVolumeSource}
×
UNCOV
1730
                obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, metricsConfigVol)
×
1731

×
1732
                setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_COLLECTORS", MetricsConfigMountPath)
×
UNCOV
1733
        }
×
1734

1735
        release, err := parseOSRelease()
1✔
1736
        if err != nil {
1✔
UNCOV
1737
                return fmt.Errorf("ERROR: failed to get os-release: %s", err)
×
UNCOV
1738
        }
×
1739

1740
        // skip SELinux changes if not an OCP cluster
1741
        if _, ok := release["OPENSHIFT_VERSION"]; !ok {
2✔
1742
                return nil
1✔
1743
        }
1✔
1744

1745
        // Add initContainer for OCP to set proper SELinux context on /var/lib/kubelet/pod-resources
1746
        initImage, err := gpuv1.ImagePath(&config.Operator.InitContainer)
×
1747
        if err != nil {
×
UNCOV
1748
                return err
×
UNCOV
1749
        }
×
1750

UNCOV
1751
        initContainer := corev1.Container{}
×
UNCOV
1752
        if initImage != "" {
×
UNCOV
1753
                initContainer.Image = initImage
×
UNCOV
1754
        }
×
UNCOV
1755
        initContainer.Name = "init-pod-nvidia-node-status-exporter"
×
UNCOV
1756
        initContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.Operator.InitContainer.ImagePullPolicy)
×
UNCOV
1757
        initContainer.Command = []string{"/bin/entrypoint.sh"}
×
1758

×
1759
        // need CAP_SYS_ADMIN privileges for collecting pod specific resources
×
1760
        privileged := true
×
1761
        securityContext := &corev1.SecurityContext{
×
1762
                Privileged: &privileged,
×
1763
        }
×
UNCOV
1764

×
UNCOV
1765
        initContainer.SecurityContext = securityContext
×
UNCOV
1766

×
UNCOV
1767
        // Disable all constraints on the configurations required by NVIDIA container toolkit
×
UNCOV
1768
        setContainerEnv(&initContainer, NvidiaDisableRequireEnvName, "true")
×
UNCOV
1769

×
UNCOV
1770
        volMountSockName, volMountSockPath := "pod-gpu-resources", "/var/lib/kubelet/pod-resources"
×
1771
        volMountSock := corev1.VolumeMount{Name: volMountSockName, MountPath: volMountSockPath}
×
1772
        initContainer.VolumeMounts = append(initContainer.VolumeMounts, volMountSock)
×
1773

×
1774
        volMountConfigName, volMountConfigPath, volMountConfigSubPath := "init-config", "/bin/entrypoint.sh", "entrypoint.sh"
×
1775
        volMountConfig := corev1.VolumeMount{Name: volMountConfigName, ReadOnly: true, MountPath: volMountConfigPath, SubPath: volMountConfigSubPath}
×
1776
        initContainer.VolumeMounts = append(initContainer.VolumeMounts, volMountConfig)
×
1777

×
1778
        obj.Spec.Template.Spec.InitContainers = append(obj.Spec.Template.Spec.InitContainers, initContainer)
×
1779

×
1780
        volMountConfigKey, volMountConfigDefaultMode := "nvidia-dcgm-exporter", int32(0700)
×
1781
        initVol := corev1.Volume{Name: volMountConfigName, VolumeSource: corev1.VolumeSource{ConfigMap: &corev1.ConfigMapVolumeSource{LocalObjectReference: corev1.LocalObjectReference{Name: volMountConfigKey}, DefaultMode: &volMountConfigDefaultMode}}}
×
1782
        obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, initVol)
×
1783

×
1784
        if len(config.DCGMExporter.Env) > 0 {
×
1785
                for _, env := range config.DCGMExporter.Env {
×
1786
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
×
1787
                }
×
1788
        }
1789

1790
        return nil
×
1791
}
1792

1793
// TransformDCGM transforms dcgm daemonset with required config as per ClusterPolicy
UNCOV
1794
func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
×
1795
        // update validation container
×
1796
        err := transformValidationInitContainer(obj, config)
×
UNCOV
1797
        if err != nil {
×
UNCOV
1798
                return err
×
UNCOV
1799
        }
×
1800
        // update image
UNCOV
1801
        image, err := gpuv1.ImagePath(&config.DCGM)
×
UNCOV
1802
        if err != nil {
×
UNCOV
1803
                return err
×
1804
        }
×
1805
        obj.Spec.Template.Spec.Containers[0].Image = image
×
1806
        // update image pull policy
×
1807
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DCGM.ImagePullPolicy)
×
UNCOV
1808
        // set image pull secrets
×
1809
        if len(config.DCGM.ImagePullSecrets) > 0 {
×
1810
                addPullSecrets(&obj.Spec.Template.Spec, config.DCGM.ImagePullSecrets)
×
1811
        }
×
1812
        // set resource limits
1813
        if config.DCGM.Resources != nil {
×
1814
                // apply resource limits to all containers
×
1815
                for i := range obj.Spec.Template.Spec.Containers {
×
1816
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DCGM.Resources.Requests
×
1817
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DCGM.Resources.Limits
×
1818
                }
×
1819
        }
1820
        // set arguments if specified for exporter container
1821
        if len(config.DCGM.Args) > 0 {
×
1822
                obj.Spec.Template.Spec.Containers[0].Args = config.DCGM.Args
×
1823
        }
×
1824
        // set/append environment variables for exporter container
1825
        if len(config.DCGM.Env) > 0 {
×
1826
                for _, env := range config.DCGM.Env {
×
1827
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
×
1828
                }
×
1829
        }
1830

1831
        // set RuntimeClass for supported runtimes
1832
        setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
×
1833

×
1834
        return nil
×
1835
}
1836

1837
// TransformMIGManager transforms MIG Manager daemonset with required config as per ClusterPolicy
func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
	// update validation container
	err := transformValidationInitContainer(obj, config)
	if err != nil {
		return err
	}

	// update image
	image, err := gpuv1.ImagePath(&config.MIGManager)
	if err != nil {
		return err
	}
	obj.Spec.Template.Spec.Containers[0].Image = image

	// update image pull policy
	obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.MIGManager.ImagePullPolicy)

	// set image pull secrets
	if len(config.MIGManager.ImagePullSecrets) > 0 {
		addPullSecrets(&obj.Spec.Template.Spec, config.MIGManager.ImagePullSecrets)
	}

	// set resource limits
	if config.MIGManager.Resources != nil {
		// apply resource limits to all containers
		for i := range obj.Spec.Template.Spec.Containers {
			obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.MIGManager.Resources.Requests
			obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.MIGManager.Resources.Limits
		}
	}

	// set arguments if specified for mig-manager container
	if len(config.MIGManager.Args) > 0 {
		obj.Spec.Template.Spec.Containers[0].Args = config.MIGManager.Args
	}

	// set RuntimeClass for supported runtimes
	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)

	// Point the "mig-parted-config" volume at the user-supplied ConfigMap when one
	// is named in ClusterPolicy; otherwise fall back to the default. Note a custom
	// name equal to the default is treated as the default. Only the first matching
	// volume is rewritten (break after the match).
	for i, vol := range obj.Spec.Template.Spec.Volumes {
		if !strings.Contains(vol.Name, "mig-parted-config") {
			continue
		}

		name := MigPartedDefaultConfigMapName
		if config.MIGManager.Config != nil && config.MIGManager.Config.Name != "" && config.MIGManager.Config.Name != MigPartedDefaultConfigMapName {
			name = config.MIGManager.Config.Name
		}
		obj.Spec.Template.Spec.Volumes[i].ConfigMap.Name = name
		break
	}

	// Point the "gpu-clients" volume at the user-supplied ConfigMap when one is
	// named in ClusterPolicy; otherwise fall back to the default. Only the first
	// matching volume is rewritten (break after the match).
	for i, vol := range obj.Spec.Template.Spec.Volumes {
		if !strings.Contains(vol.Name, "gpu-clients") {
			continue
		}

		name := MigDefaultGPUClientsConfigMapName
		if config.MIGManager.GPUClientsConfig != nil && config.MIGManager.GPUClientsConfig.Name != "" {
			name = config.MIGManager.GPUClientsConfig.Name
		}
		obj.Spec.Template.Spec.Volumes[i].ConfigMap.Name = name
		break
	}

	// update env required for CDI support; the CDI hook path is only meaningful
	// when the toolkit is also enabled (it lives under the toolkit install dir)
	if config.CDI.IsEnabled() {
		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
		if config.Toolkit.IsEnabled() {
			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook"))
		}
	}

	// set/append environment variables for the mig-manager container
	if len(config.MIGManager.Env) > 0 {
		for _, env := range config.MIGManager.Env {
			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
		}
	}

	return nil
}
1921

1922
// TransformKataManager transforms Kata Manager daemonset with required config as per ClusterPolicy
1923
func TransformKataManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
1924
        // update image
1✔
1925
        image, err := gpuv1.ImagePath(&config.KataManager)
1✔
1926
        if err != nil {
1✔
UNCOV
1927
                return err
×
UNCOV
1928
        }
×
1929
        obj.Spec.Template.Spec.Containers[0].Image = image
1✔
1930

1✔
1931
        // update image pull policy
1✔
1932
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.KataManager.ImagePullPolicy)
1✔
1933

1✔
1934
        // set image pull secrets
1✔
1935
        if len(config.KataManager.ImagePullSecrets) > 0 {
2✔
1936
                addPullSecrets(&obj.Spec.Template.Spec, config.KataManager.ImagePullSecrets)
1✔
1937
        }
1✔
1938

1939
        // set resource limits
1940
        if config.KataManager.Resources != nil {
1✔
1941
                // apply resource limits to all containers
×
1942
                for i := range obj.Spec.Template.Spec.Containers {
×
1943
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.KataManager.Resources.Requests
×
1944
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.KataManager.Resources.Limits
×
1945
                }
×
1946
        }
1947

1948
        // set arguments if specified for mig-manager container
1949
        if len(config.KataManager.Args) > 0 {
2✔
1950
                obj.Spec.Template.Spec.Containers[0].Args = config.KataManager.Args
1✔
1951
        }
1✔
1952

1953
        // mount artifactsDir
1954
        artifactsDir := DefaultKataArtifactsDir
1✔
1955
        if config.KataManager.Config.ArtifactsDir != "" {
2✔
1956
                artifactsDir = config.KataManager.Config.ArtifactsDir
1✔
1957
        }
1✔
1958

1959
        // set env used by readinessProbe to determine path to kata-manager pid file.
1960
        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "KATA_ARTIFACTS_DIR", artifactsDir)
1✔
1961

1✔
1962
        artifactsVolMount := corev1.VolumeMount{Name: "kata-artifacts", MountPath: artifactsDir}
1✔
1963
        obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, artifactsVolMount)
1✔
1964

1✔
1965
        artifactsVol := corev1.Volume{Name: "kata-artifacts", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: artifactsDir, Type: newHostPathType(corev1.HostPathDirectoryOrCreate)}}}
1✔
1966
        obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, artifactsVol)
1✔
1967

1✔
1968
        // mount containerd config and socket
1✔
1969

1✔
1970
        // setup mounts for runtime config file
1✔
1971
        runtime := n.runtime.String()
1✔
1972
        err = transformForRuntime(obj, config, runtime, "nvidia-kata-manager")
1✔
1973
        if err != nil {
1✔
UNCOV
1974
                return fmt.Errorf("error transforming kata-manager daemonset : %w", err)
×
UNCOV
1975
        }
×
1976

1977
        // Compute hash of kata manager config and add an annotation with the value.
1978
        // If the kata config changes, a new revision of the daemonset will be
1979
        // created and thus the kata-manager pods will restart with the updated config.
1980
        hash := utils.GetObjectHash(config.KataManager.Config)
1✔
1981

1✔
1982
        if obj.Spec.Template.Annotations == nil {
2✔
1983
                obj.Spec.Template.Annotations = make(map[string]string)
1✔
1984
        }
1✔
1985
        obj.Spec.Template.Annotations[KataManagerAnnotationHashKey] = hash
1✔
1986

1✔
1987
        if len(config.KataManager.Env) > 0 {
2✔
1988
                for _, env := range config.KataManager.Env {
2✔
1989
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
1✔
1990
                }
1✔
1991
        }
1992

1993
        return nil
1✔
1994
}
1995

1996
// TransformVFIOManager transforms VFIO-PCI Manager daemonset with required config as per ClusterPolicy
UNCOV
1997
func TransformVFIOManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
×
UNCOV
1998
        // update k8s-driver-manager initContainer
×
1999
        err := transformDriverManagerInitContainer(obj, &config.VFIOManager.DriverManager, nil)
×
2000
        if err != nil {
×
2001
                return fmt.Errorf("failed to transform k8s-driver-manager initContainer for VFIO Manager: %v", err)
×
2002
        }
×
2003

2004
        // update image
UNCOV
2005
        image, err := gpuv1.ImagePath(&config.VFIOManager)
×
UNCOV
2006
        if err != nil {
×
UNCOV
2007
                return err
×
UNCOV
2008
        }
×
UNCOV
2009
        obj.Spec.Template.Spec.Containers[0].Image = image
×
UNCOV
2010

×
UNCOV
2011
        // update image pull policy
×
UNCOV
2012
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.VFIOManager.ImagePullPolicy)
×
UNCOV
2013

×
UNCOV
2014
        // set image pull secrets
×
UNCOV
2015
        if len(config.VFIOManager.ImagePullSecrets) > 0 {
×
UNCOV
2016
                addPullSecrets(&obj.Spec.Template.Spec, config.VFIOManager.ImagePullSecrets)
×
UNCOV
2017
        }
×
2018

2019
        // set resource limits
UNCOV
2020
        if config.VFIOManager.Resources != nil {
×
UNCOV
2021
                // apply resource limits to all containers
×
UNCOV
2022
                for i := range obj.Spec.Template.Spec.Containers {
×
UNCOV
2023
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.VFIOManager.Resources.Requests
×
UNCOV
2024
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.VFIOManager.Resources.Limits
×
UNCOV
2025
                }
×
2026
        }
2027

2028
        // set arguments if specified for mig-manager container
UNCOV
2029
        if len(config.VFIOManager.Args) > 0 {
×
UNCOV
2030
                obj.Spec.Template.Spec.Containers[0].Args = config.VFIOManager.Args
×
UNCOV
2031
        }
×
2032

2033
        // set/append environment variables for mig-manager container
UNCOV
2034
        if len(config.VFIOManager.Env) > 0 {
×
UNCOV
2035
                for _, env := range config.VFIOManager.Env {
×
UNCOV
2036
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
×
UNCOV
2037
                }
×
2038
        }
2039

UNCOV
2040
        return nil
×
2041
}
2042

2043
// TransformCCManager transforms CC Manager daemonset with required config as per ClusterPolicy
UNCOV
2044
func TransformCCManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
×
UNCOV
2045
        // update image
×
UNCOV
2046
        image, err := gpuv1.ImagePath(&config.CCManager)
×
UNCOV
2047
        if err != nil {
×
UNCOV
2048
                return err
×
UNCOV
2049
        }
×
UNCOV
2050
        obj.Spec.Template.Spec.Containers[0].Image = image
×
UNCOV
2051

×
UNCOV
2052
        // update image pull policy
×
UNCOV
2053
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.CCManager.ImagePullPolicy)
×
UNCOV
2054

×
2055
        // set image pull secrets
×
2056
        if len(config.CCManager.ImagePullSecrets) > 0 {
×
2057
                addPullSecrets(&obj.Spec.Template.Spec, config.CCManager.ImagePullSecrets)
×
2058
        }
×
2059

2060
        // set resource limits
UNCOV
2061
        if config.CCManager.Resources != nil {
×
UNCOV
2062
                // apply resource limits to all containers
×
2063
                for i := range obj.Spec.Template.Spec.Containers {
×
2064
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.CCManager.Resources.Requests
×
2065
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.CCManager.Resources.Limits
×
2066
                }
×
2067
        }
2068

2069
        // set arguments if specified for cc-manager container
2070
        if len(config.CCManager.Args) > 0 {
×
2071
                obj.Spec.Template.Spec.Containers[0].Args = config.CCManager.Args
×
2072
        }
×
2073

2074
        // set default cc mode env
2075
        if config.CCManager.DefaultMode != "" {
×
UNCOV
2076
                setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DefaultCCModeEnvName, config.CCManager.DefaultMode)
×
UNCOV
2077
        }
×
2078

2079
        // set/append environment variables for cc-manager container
2080
        if len(config.CCManager.Env) > 0 {
×
2081
                for _, env := range config.CCManager.Env {
×
2082
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
×
2083
                }
×
2084
        }
2085

UNCOV
2086
        return nil
×
2087
}
2088

2089
// TransformVGPUDeviceManager transforms VGPU Device Manager daemonset with required config as per ClusterPolicy
func TransformVGPUDeviceManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
	// update validation container
	err := transformValidationInitContainer(obj, config)
	if err != nil {
		return err
	}

	// update image
	image, err := gpuv1.ImagePath(&config.VGPUDeviceManager)
	if err != nil {
		return err
	}
	obj.Spec.Template.Spec.Containers[0].Image = image

	// update image pull policy
	obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.VGPUDeviceManager.ImagePullPolicy)

	// set image pull secrets
	if len(config.VGPUDeviceManager.ImagePullSecrets) > 0 {
		addPullSecrets(&obj.Spec.Template.Spec, config.VGPUDeviceManager.ImagePullSecrets)
	}

	// set resource limits
	if config.VGPUDeviceManager.Resources != nil {
		// apply resource limits to all containers
		for i := range obj.Spec.Template.Spec.Containers {
			obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.VGPUDeviceManager.Resources.Requests
			obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.VGPUDeviceManager.Resources.Limits
		}
	}

	// set arguments if specified for vgpu-device-manager container
	if len(config.VGPUDeviceManager.Args) > 0 {
		obj.Spec.Template.Spec.Containers[0].Args = config.VGPUDeviceManager.Args
	}

	// set/append environment variables for vgpu-device-manager container
	if len(config.VGPUDeviceManager.Env) > 0 {
		for _, env := range config.VGPUDeviceManager.Env {
			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
		}
	}

	// Point the "vgpu-config" volume at the user-supplied ConfigMap when one is
	// named in ClusterPolicy; otherwise fall back to the default. Only the first
	// matching volume is rewritten (break after the match).
	for i, vol := range obj.Spec.Template.Spec.Volumes {
		if !strings.Contains(vol.Name, "vgpu-config") {
			continue
		}

		name := VgpuDMDefaultConfigMapName
		if config.VGPUDeviceManager.Config != nil && config.VGPUDeviceManager.Config.Name != "" {
			name = config.VGPUDeviceManager.Config.Name
		}
		obj.Spec.Template.Spec.Volumes[i].ConfigMap.Name = name
		break
	}

	// set name of default vGPU device configuration. The default configuration is applied if the node
	// is not labelled with a specific configuration
	defaultConfig := VgpuDMDefaultConfigName
	if config.VGPUDeviceManager.Config != nil && config.VGPUDeviceManager.Config.Default != "" {
		defaultConfig = config.VGPUDeviceManager.Config.Default
	}
	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DEFAULT_VGPU_CONFIG", defaultConfig)

	return nil
}
2157

2158
// transformValidatorSecurityContext updates the security context for a validator
2159
// container so that it runs as uid 0. Some of the validations run commands
2160
// that require root privileges (e.g. chroot). In addition, all validations
2161
// create / delete status files in the '/run/nvidia/validations' host path
2162
// volume. This directory is initially created by the kubelet and thus has
2163
// the same group and ownership as the kubelet.
2164
func transformValidatorSecurityContext(ctr *corev1.Container) {
1✔
2165
        if ctr.SecurityContext == nil {
2✔
2166
                ctr.SecurityContext = &corev1.SecurityContext{}
1✔
2167
        }
1✔
2168
        ctr.SecurityContext.RunAsUser = rootUID
1✔
2169
}
2170

2171
// TransformValidator transforms nvidia-operator-validator daemonset with required config as per ClusterPolicy
2172
func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
2173
        err := TransformValidatorShared(obj, config)
1✔
2174
        if err != nil {
2✔
2175
                return fmt.Errorf("%v", err)
1✔
2176
        }
1✔
2177

2178
        // set RuntimeClass for supported runtimes
2179
        setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
1✔
2180

1✔
2181
        var validatorErr error
1✔
2182
        // apply changes for individual component validators(initContainers)
1✔
2183
        components := []string{
1✔
2184
                "driver",
1✔
2185
                "nvidia-fs",
1✔
2186
                "toolkit",
1✔
2187
                "cuda",
1✔
2188
                "plugin",
1✔
2189
        }
1✔
2190

1✔
2191
        for _, component := range components {
2✔
2192
                if err := TransformValidatorComponent(config, &obj.Spec.Template.Spec, component); err != nil {
1✔
2193
                        validatorErr = errors.Join(validatorErr, err)
×
2194
                }
×
2195
        }
2196

2197
        if validatorErr != nil {
1✔
2198
                n.logger.Info("WARN: errors transforming the validator containers: %v", validatorErr)
×
2199
        }
×
2200

2201
        return nil
1✔
2202
}
2203

2204
// TransformSandboxValidator transforms nvidia-sandbox-validator daemonset with required config as per ClusterPolicy
2205
func TransformSandboxValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
2206
        err := TransformValidatorShared(obj, config)
1✔
2207
        if err != nil {
2✔
2208
                return fmt.Errorf("%v", err)
1✔
2209
        }
1✔
2210

2211
        var validatorErr error
1✔
2212
        // apply changes for individual component validators(initContainers)
1✔
2213
        components := []string{
1✔
2214
                "cc-manager",
1✔
2215
                "vfio-pci",
1✔
2216
                "vgpu-manager",
1✔
2217
                "vgpu-devices",
1✔
2218
        }
1✔
2219

1✔
2220
        for _, component := range components {
2✔
2221
                if err := TransformValidatorComponent(config, &obj.Spec.Template.Spec, component); err != nil {
1✔
UNCOV
2222
                        validatorErr = errors.Join(validatorErr, err)
×
UNCOV
2223
                }
×
2224
        }
2225

2226
        if validatorErr != nil {
1✔
UNCOV
2227
                n.logger.Info("WARN: errors transforming the validator containers: %v", validatorErr)
×
UNCOV
2228
        }
×
2229

2230
        return nil
1✔
2231
}
2232

2233
// TransformValidatorShared applies general transformations to the validator daemonset with required config as per ClusterPolicy
2234
func TransformValidatorShared(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
1✔
2235
        // update image
1✔
2236
        image, err := gpuv1.ImagePath(&config.Validator)
1✔
2237
        if err != nil {
2✔
2238
                return err
1✔
2239
        }
1✔
2240
        obj.Spec.Template.Spec.Containers[0].Image = image
1✔
2241
        // update image pull policy
1✔
2242
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.Validator.ImagePullPolicy)
1✔
2243
        // set image pull secrets
1✔
2244
        if len(config.Validator.ImagePullSecrets) > 0 {
2✔
2245
                addPullSecrets(&obj.Spec.Template.Spec, config.Validator.ImagePullSecrets)
1✔
2246
        }
1✔
2247
        // set resource limits
2248
        if config.Validator.Resources != nil {
2✔
2249
                // apply resource limits to all containers
1✔
2250
                for i := range obj.Spec.Template.Spec.Containers {
2✔
2251
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.Validator.Resources.Requests
1✔
2252
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.Validator.Resources.Limits
1✔
2253
                }
1✔
2254
        }
2255
        // set arguments if specified for validator container
2256
        if len(config.Validator.Args) > 0 {
2✔
2257
                obj.Spec.Template.Spec.Containers[0].Args = config.Validator.Args
1✔
2258
        }
1✔
2259
        // set/append environment variables for validator container
2260
        if len(config.Validator.Env) > 0 {
2✔
2261
                for _, env := range config.Validator.Env {
2✔
2262
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
1✔
2263
                }
1✔
2264
        }
2265
        // update the security context for the validator container
2266
        transformValidatorSecurityContext(&obj.Spec.Template.Spec.Containers[0])
1✔
2267

1✔
2268
        return nil
1✔
2269
}
2270

2271
// TransformValidatorComponent applies changes to given validator component
2272
func TransformValidatorComponent(config *gpuv1.ClusterPolicySpec, podSpec *corev1.PodSpec, component string) error {
1✔
2273
        for i, initContainer := range podSpec.InitContainers {
2✔
2274
                // skip if not component validation initContainer
1✔
2275
                if !strings.Contains(initContainer.Name, fmt.Sprintf("%s-validation", component)) {
2✔
2276
                        continue
1✔
2277
                }
2278
                // update validation image
2279
                image, err := gpuv1.ImagePath(&config.Validator)
1✔
2280
                if err != nil {
2✔
2281
                        return err
1✔
2282
                }
1✔
2283
                podSpec.InitContainers[i].Image = image
1✔
2284
                // update validation image pull policy
1✔
2285
                if config.Validator.ImagePullPolicy != "" {
2✔
2286
                        podSpec.InitContainers[i].ImagePullPolicy = gpuv1.ImagePullPolicy(config.Validator.ImagePullPolicy)
1✔
2287
                }
1✔
2288
                // update the security context for the validator container
2289
                transformValidatorSecurityContext(&podSpec.InitContainers[i])
1✔
2290

1✔
2291
                switch component {
1✔
2292
                case "cuda":
1✔
2293
                        // set additional env to indicate image, pullSecrets to spin-off cuda validation workload pod.
1✔
2294
                        setContainerEnv(&(podSpec.InitContainers[i]), ValidatorImageEnvName, image)
1✔
2295
                        setContainerEnv(&(podSpec.InitContainers[i]), ValidatorImagePullPolicyEnvName, config.Validator.ImagePullPolicy)
1✔
2296
                        var pullSecrets string
1✔
2297
                        if len(config.Validator.ImagePullSecrets) > 0 {
2✔
2298
                                pullSecrets = strings.Join(config.Validator.ImagePullSecrets, ",")
1✔
2299
                                setContainerEnv(&(podSpec.InitContainers[i]), ValidatorImagePullSecretsEnvName, pullSecrets)
1✔
2300
                        }
1✔
2301
                        if podSpec.RuntimeClassName != nil {
2✔
2302
                                setContainerEnv(&(podSpec.InitContainers[i]), ValidatorRuntimeClassEnvName, *podSpec.RuntimeClassName)
1✔
2303
                        }
1✔
2304
                        // set/append environment variables for cuda-validation container
2305
                        if len(config.Validator.CUDA.Env) > 0 {
2✔
2306
                                for _, env := range config.Validator.CUDA.Env {
2✔
2307
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
1✔
2308
                                }
1✔
2309
                        }
2310
                case "plugin":
1✔
2311
                        // remove plugin init container from validator Daemonset if it is not enabled
1✔
2312
                        if !config.DevicePlugin.IsEnabled() {
2✔
2313
                                podSpec.InitContainers = append(podSpec.InitContainers[:i], podSpec.InitContainers[i+1:]...)
1✔
2314
                                return nil
1✔
2315
                        }
1✔
2316
                        // set additional env to indicate image, pullSecrets to spin-off plugin validation workload pod.
2317
                        setContainerEnv(&(podSpec.InitContainers[i]), ValidatorImageEnvName, image)
1✔
2318
                        setContainerEnv(&(podSpec.InitContainers[i]), ValidatorImagePullPolicyEnvName, config.Validator.ImagePullPolicy)
1✔
2319
                        var pullSecrets string
1✔
2320
                        if len(config.Validator.ImagePullSecrets) > 0 {
2✔
2321
                                pullSecrets = strings.Join(config.Validator.ImagePullSecrets, ",")
1✔
2322
                                setContainerEnv(&(podSpec.InitContainers[i]), ValidatorImagePullSecretsEnvName, pullSecrets)
1✔
2323
                        }
1✔
2324
                        if podSpec.RuntimeClassName != nil {
2✔
2325
                                setContainerEnv(&(podSpec.InitContainers[i]), ValidatorRuntimeClassEnvName, *podSpec.RuntimeClassName)
1✔
2326
                        }
1✔
2327
                        // apply mig-strategy env to spin off plugin-validation workload pod
2328
                        setContainerEnv(&(podSpec.InitContainers[i]), MigStrategyEnvName, string(config.MIG.Strategy))
1✔
2329
                        // set/append environment variables for plugin-validation container
1✔
2330
                        if len(config.Validator.Plugin.Env) > 0 {
2✔
2331
                                for _, env := range config.Validator.Plugin.Env {
2✔
2332
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
1✔
2333
                                }
1✔
2334
                        }
2335
                case "driver":
1✔
2336
                        // set/append environment variables for driver-validation container
1✔
2337
                        if len(config.Validator.Driver.Env) > 0 {
2✔
2338
                                for _, env := range config.Validator.Driver.Env {
2✔
2339
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
1✔
2340
                                }
1✔
2341
                        }
2342
                case "nvidia-fs":
1✔
2343
                        if config.GPUDirectStorage == nil || !config.GPUDirectStorage.IsEnabled() {
2✔
2344
                                // remove  nvidia-fs init container from validator Daemonset if GDS is not enabled
1✔
2345
                                podSpec.InitContainers = append(podSpec.InitContainers[:i], podSpec.InitContainers[i+1:]...)
1✔
2346
                                return nil
1✔
2347
                        }
1✔
2348
                case "cc-manager":
1✔
2349
                        if !config.CCManager.IsEnabled() {
2✔
2350
                                // remove  cc-manager init container from validator Daemonset if it is not enabled
1✔
2351
                                podSpec.InitContainers = append(podSpec.InitContainers[:i], podSpec.InitContainers[i+1:]...)
1✔
2352
                                return nil
1✔
2353
                        }
1✔
2354
                case "toolkit":
1✔
2355
                        // set/append environment variables for toolkit-validation container
1✔
2356
                        if len(config.Validator.Toolkit.Env) > 0 {
2✔
2357
                                for _, env := range config.Validator.Toolkit.Env {
2✔
2358
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
1✔
2359
                                }
1✔
2360
                        }
2361
                case "vfio-pci":
1✔
2362
                        // set/append environment variables for vfio-pci-validation container
1✔
2363
                        setContainerEnv(&(podSpec.InitContainers[i]), "DEFAULT_GPU_WORKLOAD_CONFIG", defaultGPUWorkloadConfig)
1✔
2364
                        if len(config.Validator.VFIOPCI.Env) > 0 {
2✔
2365
                                for _, env := range config.Validator.VFIOPCI.Env {
2✔
2366
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
1✔
2367
                                }
1✔
2368
                        }
2369
                case "vgpu-manager":
1✔
2370
                        // set/append environment variables for vgpu-manager-validation container
1✔
2371
                        setContainerEnv(&(podSpec.InitContainers[i]), "DEFAULT_GPU_WORKLOAD_CONFIG", defaultGPUWorkloadConfig)
1✔
2372
                        if len(config.Validator.VGPUManager.Env) > 0 {
2✔
2373
                                for _, env := range config.Validator.VGPUManager.Env {
2✔
2374
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
1✔
2375
                                }
1✔
2376
                        }
2377
                case "vgpu-devices":
1✔
2378
                        // set/append environment variables for vgpu-devices-validation container
1✔
2379
                        setContainerEnv(&(podSpec.InitContainers[i]), "DEFAULT_GPU_WORKLOAD_CONFIG", defaultGPUWorkloadConfig)
1✔
2380
                        if len(config.Validator.VGPUDevices.Env) > 0 {
2✔
2381
                                for _, env := range config.Validator.VGPUDevices.Env {
2✔
2382
                                        setContainerEnv(&(podSpec.InitContainers[i]), env.Name, env.Value)
1✔
2383
                                }
1✔
2384
                        }
UNCOV
2385
                default:
×
UNCOV
2386
                        return fmt.Errorf("invalid component provided to apply validator changes")
×
2387
                }
2388
        }
2389
        return nil
1✔
2390
}
2391

2392
// TransformNodeStatusExporter transforms the node-status-exporter daemonset with required config as per ClusterPolicy
2393
func TransformNodeStatusExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
2394
        // update validation container
1✔
2395
        err := transformValidationInitContainer(obj, config)
1✔
2396
        if err != nil {
1✔
UNCOV
2397
                return err
×
UNCOV
2398
        }
×
2399

2400
        // update image
2401
        image, err := gpuv1.ImagePath(&config.NodeStatusExporter)
1✔
2402
        if err != nil {
2✔
2403
                return err
1✔
2404
        }
1✔
2405
        obj.Spec.Template.Spec.Containers[0].Image = image
1✔
2406

1✔
2407
        // update image pull policy
1✔
2408
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.NodeStatusExporter.ImagePullPolicy)
1✔
2409

1✔
2410
        // set image pull secrets
1✔
2411
        if len(config.NodeStatusExporter.ImagePullSecrets) > 0 {
1✔
UNCOV
2412
                addPullSecrets(&obj.Spec.Template.Spec, config.NodeStatusExporter.ImagePullSecrets)
×
UNCOV
2413
        }
×
2414

2415
        // set resource limits
2416
        if config.NodeStatusExporter.Resources != nil {
1✔
UNCOV
2417
                // apply resource limits to all containers
×
UNCOV
2418
                for i := range obj.Spec.Template.Spec.Containers {
×
UNCOV
2419
                        obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.NodeStatusExporter.Resources.Requests
×
UNCOV
2420
                        obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.NodeStatusExporter.Resources.Limits
×
UNCOV
2421
                }
×
2422
        }
2423

2424
        // set arguments if specified for driver container
2425
        if len(config.NodeStatusExporter.Args) > 0 {
1✔
UNCOV
2426
                obj.Spec.Template.Spec.Containers[0].Args = config.NodeStatusExporter.Args
×
UNCOV
2427
        }
×
2428

2429
        // set/append environment variables for exporter container
2430
        if len(config.NodeStatusExporter.Env) > 0 {
1✔
UNCOV
2431
                for _, env := range config.NodeStatusExporter.Env {
×
UNCOV
2432
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
×
UNCOV
2433
                }
×
2434
        }
2435

2436
        // update the security context for the node status exporter container.
2437
        transformValidatorSecurityContext(&obj.Spec.Template.Spec.Containers[0])
1✔
2438

1✔
2439
        return nil
1✔
2440
}
2441

2442
// get runtime(docker, containerd) config file path based on toolkit container env or default
2443
func getRuntimeConfigFile(c *corev1.Container, runtime string) (string, error) {
1✔
2444
        var runtimeConfigFile string
1✔
2445
        switch runtime {
1✔
UNCOV
2446
        case gpuv1.Docker.String():
×
UNCOV
2447
                runtimeConfigFile = DefaultDockerConfigFile
×
UNCOV
2448
                if value := getContainerEnv(c, "DOCKER_CONFIG"); value != "" {
×
UNCOV
2449
                        runtimeConfigFile = value
×
UNCOV
2450
                }
×
2451
        case gpuv1.Containerd.String():
1✔
2452
                runtimeConfigFile = DefaultContainerdConfigFile
1✔
2453
                if value := getContainerEnv(c, "CONTAINERD_CONFIG"); value != "" {
1✔
UNCOV
2454
                        runtimeConfigFile = value
×
2455
                }
×
2456
        case gpuv1.CRIO.String():
1✔
2457
                runtimeConfigFile = DefaultCRIOConfigFile
1✔
2458
                if value := getContainerEnv(c, "CRIO_CONFIG"); value != "" {
1✔
UNCOV
2459
                        runtimeConfigFile = value
×
UNCOV
2460
                }
×
UNCOV
2461
        default:
×
UNCOV
2462
                return "", fmt.Errorf("invalid runtime: %s", runtime)
×
2463
        }
2464

2465
        return runtimeConfigFile, nil
1✔
2466
}
2467

2468
// get runtime(docker, containerd) socket file path based on toolkit container env or default
2469
func getRuntimeSocketFile(c *corev1.Container, runtime string) (string, error) {
1✔
2470
        var runtimeSocketFile string
1✔
2471
        switch runtime {
1✔
UNCOV
2472
        case gpuv1.Docker.String():
×
UNCOV
2473
                runtimeSocketFile = DefaultDockerSocketFile
×
UNCOV
2474
                if getContainerEnv(c, "DOCKER_SOCKET") != "" {
×
2475
                        runtimeSocketFile = getContainerEnv(c, "DOCKER_SOCKET")
×
2476
                }
×
2477
        case gpuv1.Containerd.String():
1✔
2478
                runtimeSocketFile = DefaultContainerdSocketFile
1✔
2479
                if getContainerEnv(c, "CONTAINERD_SOCKET") != "" {
1✔
UNCOV
2480
                        runtimeSocketFile = getContainerEnv(c, "CONTAINERD_SOCKET")
×
UNCOV
2481
                }
×
2482
        case gpuv1.CRIO.String():
1✔
2483
                runtimeSocketFile = ""
1✔
2484
        default:
×
2485
                return "", fmt.Errorf("invalid runtime: %s", runtime)
×
2486
        }
2487

2488
        return runtimeSocketFile, nil
1✔
2489
}
2490

2491
func getContainerEnv(c *corev1.Container, key string) string {
1✔
2492
        for _, val := range c.Env {
2✔
2493
                if val.Name == key {
1✔
UNCOV
2494
                        return val.Value
×
UNCOV
2495
                }
×
2496
        }
2497
        return ""
1✔
2498
}
2499

2500
func setContainerEnv(c *corev1.Container, key, value string) {
1✔
2501
        for i, val := range c.Env {
2✔
2502
                if val.Name != key {
2✔
2503
                        continue
1✔
2504
                }
2505

2506
                c.Env[i].Value = value
1✔
2507
                return
1✔
2508
        }
2509
        c.Env = append(c.Env, corev1.EnvVar{Name: key, Value: value})
1✔
2510
}
2511

2512
func getRuntimeClass(config *gpuv1.ClusterPolicySpec) string {
1✔
2513
        if config.Operator.RuntimeClass != "" {
2✔
2514
                return config.Operator.RuntimeClass
1✔
2515
        }
1✔
2516
        return DefaultRuntimeClass
1✔
2517
}
2518

2519
func setRuntimeClass(podSpec *corev1.PodSpec, runtime gpuv1.Runtime, runtimeClass string) {
1✔
2520
        if runtime == gpuv1.Containerd {
2✔
2521
                if runtimeClass == "" {
2✔
2522
                        runtimeClass = DefaultRuntimeClass
1✔
2523
                }
1✔
2524
                podSpec.RuntimeClassName = &runtimeClass
1✔
2525
        }
2526
}
2527

2528
func setContainerProbe(container *corev1.Container, probe *gpuv1.ContainerProbeSpec, probeType ContainerProbe) {
1✔
2529
        var containerProbe *corev1.Probe
1✔
2530

1✔
2531
        // determine probe type to update
1✔
2532
        switch probeType {
1✔
2533
        case Startup:
1✔
2534
                containerProbe = container.StartupProbe
1✔
UNCOV
2535
        case Liveness:
×
UNCOV
2536
                containerProbe = container.LivenessProbe
×
UNCOV
2537
        case Readiness:
×
2538
                containerProbe = container.ReadinessProbe
×
2539
        }
2540

2541
        // set probe parameters if specified
2542
        if probe.InitialDelaySeconds != 0 {
2✔
2543
                containerProbe.InitialDelaySeconds = probe.InitialDelaySeconds
1✔
2544
        }
1✔
2545
        if probe.TimeoutSeconds != 0 {
2✔
2546
                containerProbe.TimeoutSeconds = probe.TimeoutSeconds
1✔
2547
        }
1✔
2548
        if probe.FailureThreshold != 0 {
2✔
2549
                containerProbe.FailureThreshold = probe.FailureThreshold
1✔
2550
        }
1✔
2551
        if probe.SuccessThreshold != 0 {
1✔
2552
                containerProbe.SuccessThreshold = probe.SuccessThreshold
×
2553
        }
×
2554
        if probe.PeriodSeconds != 0 {
2✔
2555
                containerProbe.PeriodSeconds = probe.PeriodSeconds
1✔
2556
        }
1✔
2557
}
2558

2559
// applies MIG related configuration env to container spec
2560
func applyMIGConfiguration(c *corev1.Container, strategy gpuv1.MIGStrategy) {
1✔
2561
        // if not set then let plugin decide this per node(default: none)
1✔
2562
        if strategy == "" {
2✔
2563
                setContainerEnv(c, "NVIDIA_MIG_MONITOR_DEVICES", "all")
1✔
2564
                return
1✔
2565
        }
1✔
2566

2567
        setContainerEnv(c, "MIG_STRATEGY", string(strategy))
1✔
2568
        if strategy != gpuv1.MIGStrategyNone {
2✔
2569
                setContainerEnv(c, "NVIDIA_MIG_MONITOR_DEVICES", "all")
1✔
2570
        }
1✔
2571
}
2572

2573
// checks if custom plugin config is provided through a ConfigMap
2574
func isCustomPluginConfigSet(pluginConfig *gpuv1.DevicePluginConfig) bool {
1✔
2575
        if pluginConfig != nil && pluginConfig.Name != "" {
2✔
2576
                return true
1✔
2577
        }
1✔
2578
        return false
1✔
2579
}
2580

2581
// adds shared volume mounts required for custom plugin config provided via a ConfigMap
2582
func addSharedMountsForPluginConfig(container *corev1.Container, config *gpuv1.DevicePluginConfig) {
1✔
2583
        emptyDirMount := corev1.VolumeMount{Name: "config", MountPath: "/config"}
1✔
2584
        configVolMount := corev1.VolumeMount{Name: config.Name, MountPath: "/available-configs"}
1✔
2585

1✔
2586
        container.VolumeMounts = append(container.VolumeMounts, emptyDirMount)
1✔
2587
        container.VolumeMounts = append(container.VolumeMounts, configVolMount)
1✔
2588
}
1✔
2589

2590
// apply spec changes to make custom configurations provided via a ConfigMap available to all containers
2591
func handleDevicePluginConfig(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
1✔
2592
        if !isCustomPluginConfigSet(config.DevicePlugin.Config) {
2✔
2593
                // remove config-manager-init container
1✔
2594
                for i, initContainer := range obj.Spec.Template.Spec.InitContainers {
2✔
2595
                        if initContainer.Name != "config-manager-init" {
2✔
2596
                                continue
1✔
2597
                        }
2598
                        obj.Spec.Template.Spec.InitContainers = append(obj.Spec.Template.Spec.InitContainers[:i], obj.Spec.Template.Spec.InitContainers[i+1:]...)
1✔
2599
                }
2600
                // remove config-manager sidecar container
2601
                for i, container := range obj.Spec.Template.Spec.Containers {
2✔
2602
                        if container.Name != "config-manager" {
2✔
2603
                                continue
1✔
2604
                        }
2605
                        obj.Spec.Template.Spec.Containers = append(obj.Spec.Template.Spec.Containers[:i], obj.Spec.Template.Spec.Containers[i+1:]...)
1✔
2606
                }
2607
                return nil
1✔
2608
        }
2609

2610
        // Apply custom configuration provided through ConfigMap
2611
        // setup env for main container
2612
        for i, container := range obj.Spec.Template.Spec.Containers {
2✔
2613
                switch container.Name {
1✔
2614
                case "nvidia-device-plugin":
1✔
UNCOV
2615
                case "gpu-feature-discovery":
×
UNCOV
2616
                case "mps-control-daemon-ctr":
×
2617
                default:
1✔
2618
                        // skip if not the main container
1✔
2619
                        continue
1✔
2620
                }
2621
                setContainerEnv(&obj.Spec.Template.Spec.Containers[i], "CONFIG_FILE", "/config/config.yaml")
1✔
2622
                // setup sharedvolume(emptydir) for main container
1✔
2623
                addSharedMountsForPluginConfig(&obj.Spec.Template.Spec.Containers[i], config.DevicePlugin.Config)
1✔
2624
        }
2625

2626
        // if hostPID is already set, we skip setting the shareProcessNamespace field
2627
        // for context, go to https://github.com/kubernetes-client/go/blob/master/kubernetes/docs/V1PodSpec.md
2628
        if !obj.Spec.Template.Spec.HostPID {
2✔
2629
                // Enable process ns sharing for PID access
1✔
2630
                shareProcessNamespace := true
1✔
2631
                obj.Spec.Template.Spec.ShareProcessNamespace = &shareProcessNamespace
1✔
2632
        }
1✔
2633
        // setup volumes from configmap and shared emptyDir
2634
        obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, createConfigMapVolume(config.DevicePlugin.Config.Name, nil))
1✔
2635
        obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, createEmptyDirVolume("config"))
1✔
2636

1✔
2637
        // apply env/volume changes to initContainer
1✔
2638
        err := transformConfigManagerInitContainer(obj, config)
1✔
2639
        if err != nil {
1✔
UNCOV
2640
                return err
×
UNCOV
2641
        }
×
2642
        // apply env/volume changes to sidecarContainer
2643
        err = transformConfigManagerSidecarContainer(obj, config)
1✔
2644
        if err != nil {
1✔
UNCOV
2645
                return err
×
UNCOV
2646
        }
×
2647
        return nil
1✔
2648
}
2649

2650
func transformConfigManagerInitContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
1✔
2651
        var initContainer *corev1.Container
1✔
2652
        for i := range obj.Spec.Template.Spec.InitContainers {
2✔
2653
                if obj.Spec.Template.Spec.InitContainers[i].Name != "config-manager-init" {
2✔
2654
                        continue
1✔
2655
                }
2656
                initContainer = &obj.Spec.Template.Spec.InitContainers[i]
1✔
2657
        }
2658
        if initContainer == nil {
1✔
UNCOV
2659
                // config-manager-init container is not added to the spec, this is a no-op
×
UNCOV
2660
                return nil
×
UNCOV
2661
        }
×
2662
        configManagerImage, err := gpuv1.ImagePath(&config.DevicePlugin)
1✔
2663
        if err != nil {
1✔
UNCOV
2664
                return err
×
UNCOV
2665
        }
×
2666
        initContainer.Image = configManagerImage
1✔
2667
        if config.DevicePlugin.ImagePullPolicy != "" {
1✔
UNCOV
2668
                initContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy)
×
UNCOV
2669
        }
×
2670
        // setup env
2671
        setContainerEnv(initContainer, "DEFAULT_CONFIG", config.DevicePlugin.Config.Default)
1✔
2672
        setContainerEnv(initContainer, "FALLBACK_STRATEGIES", "empty")
1✔
2673

1✔
2674
        // setup volume mounts
1✔
2675
        addSharedMountsForPluginConfig(initContainer, config.DevicePlugin.Config)
1✔
2676
        return nil
1✔
2677
}
2678

2679
func transformConfigManagerSidecarContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
1✔
2680
        var container *corev1.Container
1✔
2681
        for i := range obj.Spec.Template.Spec.Containers {
2✔
2682
                if obj.Spec.Template.Spec.Containers[i].Name != "config-manager" {
2✔
2683
                        continue
1✔
2684
                }
2685
                container = &obj.Spec.Template.Spec.Containers[i]
1✔
2686
        }
2687
        if container == nil {
1✔
UNCOV
2688
                // config-manager-init container is not added to the spec, this is a no-op
×
UNCOV
2689
                return nil
×
UNCOV
2690
        }
×
2691
        configManagerImage, err := gpuv1.ImagePath(&config.DevicePlugin)
1✔
2692
        if err != nil {
1✔
UNCOV
2693
                return err
×
UNCOV
2694
        }
×
2695
        container.Image = configManagerImage
1✔
2696
        if config.DevicePlugin.ImagePullPolicy != "" {
1✔
UNCOV
2697
                container.ImagePullPolicy = gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy)
×
2698
        }
×
2699
        // setup env
2700
        setContainerEnv(container, "DEFAULT_CONFIG", config.DevicePlugin.Config.Default)
1✔
2701
        setContainerEnv(container, "FALLBACK_STRATEGIES", "empty")
1✔
2702

1✔
2703
        // setup volume mounts
1✔
2704
        addSharedMountsForPluginConfig(container, config.DevicePlugin.Config)
1✔
2705
        return nil
1✔
2706
}
2707

2708
func transformDriverManagerInitContainer(obj *appsv1.DaemonSet, driverManagerSpec *gpuv1.DriverManagerSpec, rdmaSpec *gpuv1.GPUDirectRDMASpec) error {
1✔
2709
        var container *corev1.Container
1✔
2710
        for i, initCtr := range obj.Spec.Template.Spec.InitContainers {
2✔
2711
                if initCtr.Name == "k8s-driver-manager" {
2✔
2712
                        container = &obj.Spec.Template.Spec.InitContainers[i]
1✔
2713
                        break
1✔
2714
                }
2715
        }
2716

2717
        if container == nil {
1✔
2718
                return fmt.Errorf("failed to find k8s-driver-manager initContainer in spec")
×
2719
        }
×
2720

2721
        managerImage, err := gpuv1.ImagePath(driverManagerSpec)
1✔
2722
        if err != nil {
1✔
2723
                return err
×
UNCOV
2724
        }
×
2725
        container.Image = managerImage
1✔
2726

1✔
2727
        if driverManagerSpec.ImagePullPolicy != "" {
2✔
2728
                container.ImagePullPolicy = gpuv1.ImagePullPolicy(driverManagerSpec.ImagePullPolicy)
1✔
2729
        }
1✔
2730

2731
        if rdmaSpec != nil && rdmaSpec.IsEnabled() {
2✔
2732
                setContainerEnv(container, GPUDirectRDMAEnabledEnvName, "true")
1✔
2733
                if rdmaSpec.IsHostMOFED() {
2✔
2734
                        setContainerEnv(container, UseHostMOFEDEnvName, "true")
1✔
2735
                }
1✔
2736
        }
2737

2738
        // set/append environment variables for driver-manager initContainer
2739
        if len(driverManagerSpec.Env) > 0 {
2✔
2740
                for _, env := range driverManagerSpec.Env {
2✔
2741
                        setContainerEnv(container, env.Name, env.Value)
1✔
2742
                }
1✔
2743
        }
2744

2745
        // add any pull secrets needed for driver-manager image
2746
        if len(driverManagerSpec.ImagePullSecrets) > 0 {
2✔
2747
                addPullSecrets(&obj.Spec.Template.Spec, driverManagerSpec.ImagePullSecrets)
1✔
2748
        }
1✔
2749

2750
        return nil
1✔
2751
}
2752

2753
func transformPeerMemoryContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
2754
        for i, container := range obj.Spec.Template.Spec.Containers {
2✔
2755
                // skip if not nvidia-peermem
1✔
2756
                if !strings.Contains(container.Name, "nvidia-peermem") {
2✔
2757
                        continue
1✔
2758
                }
2759
                if config.Driver.GPUDirectRDMA == nil || !config.Driver.GPUDirectRDMA.IsEnabled() {
2✔
2760
                        // remove nvidia-peermem sidecar container from driver Daemonset if RDMA is not enabled
1✔
2761
                        obj.Spec.Template.Spec.Containers = append(obj.Spec.Template.Spec.Containers[:i], obj.Spec.Template.Spec.Containers[i+1:]...)
1✔
2762
                        return nil
1✔
2763
                }
1✔
2764
                // update nvidia-peermem driver image and pull policy to be same as gpu-driver image
2765
                // as its installed as part of gpu-driver image
UNCOV
2766
                driverImage, err := resolveDriverTag(n, &config.Driver)
×
UNCOV
2767
                if err != nil {
×
UNCOV
2768
                        return err
×
UNCOV
2769
                }
×
UNCOV
2770
                if driverImage != "" {
×
UNCOV
2771
                        obj.Spec.Template.Spec.Containers[i].Image = driverImage
×
UNCOV
2772
                }
×
UNCOV
2773
                if config.Driver.ImagePullPolicy != "" {
×
UNCOV
2774
                        obj.Spec.Template.Spec.Containers[i].ImagePullPolicy = gpuv1.ImagePullPolicy(config.Driver.ImagePullPolicy)
×
UNCOV
2775
                }
×
2776
                if config.Driver.GPUDirectRDMA.UseHostMOFED != nil && *config.Driver.GPUDirectRDMA.UseHostMOFED {
×
2777
                        // set env indicating host-mofed is enabled
×
UNCOV
2778
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), UseHostMOFEDEnvName, "true")
×
UNCOV
2779
                }
×
2780
                // mount any custom kernel module configuration parameters at /drivers
2781
                if config.Driver.KernelModuleConfig != nil && config.Driver.KernelModuleConfig.Name != "" {
×
2782
                        // note: transformDriverContainer() will have already created a Volume backed by the ConfigMap.
×
UNCOV
2783
                        // Only add a VolumeMount for nvidia-peermem-ctr.
×
UNCOV
2784
                        destinationDir := "/drivers"
×
UNCOV
2785
                        volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.KernelModuleConfig.Name, destinationDir)
×
UNCOV
2786
                        if err != nil {
×
UNCOV
2787
                                return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for kernel module configuration: %v", err)
×
UNCOV
2788
                        }
×
UNCOV
2789
                        obj.Spec.Template.Spec.Containers[i].VolumeMounts = append(obj.Spec.Template.Spec.Containers[i].VolumeMounts, volumeMounts...)
×
2790
                }
2791
        }
UNCOV
2792
        return nil
×
2793
}
2794

2795
// check if running with openshift and add an ENV VAR to the OCP DTK CTR
2796
func transformGDSContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
2797
        for i, container := range obj.Spec.Template.Spec.Containers {
2✔
2798
                // skip if not nvidia-fs
1✔
2799
                if !strings.Contains(container.Name, "nvidia-fs") {
2✔
2800
                        continue
1✔
2801
                }
2802
                if config.GPUDirectStorage == nil || !config.GPUDirectStorage.IsEnabled() {
2✔
2803
                        n.logger.Info("GPUDirect Storage is disabled")
1✔
2804
                        // remove nvidia-fs sidecar container from driver Daemonset if GDS is not enabled
1✔
2805
                        obj.Spec.Template.Spec.Containers = append(obj.Spec.Template.Spec.Containers[:i], obj.Spec.Template.Spec.Containers[i+1:]...)
1✔
2806
                        return nil
1✔
2807
                }
1✔
UNCOV
2808
                if config.Driver.UsePrecompiledDrivers() {
×
UNCOV
2809
                        return fmt.Errorf("GPUDirect Storage driver (nvidia-fs) is not supported along with pre-compiled NVIDIA drivers")
×
UNCOV
2810
                }
×
2811

UNCOV
2812
                gdsContainer := &obj.Spec.Template.Spec.Containers[i]
×
UNCOV
2813

×
UNCOV
2814
                // update nvidia-fs(sidecar) image and pull policy
×
UNCOV
2815
                gdsImage, err := resolveDriverTag(n, config.GPUDirectStorage)
×
UNCOV
2816
                if err != nil {
×
UNCOV
2817
                        return err
×
UNCOV
2818
                }
×
UNCOV
2819
                if gdsImage != "" {
×
UNCOV
2820
                        gdsContainer.Image = gdsImage
×
UNCOV
2821
                }
×
UNCOV
2822
                if config.GPUDirectStorage.ImagePullPolicy != "" {
×
UNCOV
2823
                        gdsContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.GPUDirectStorage.ImagePullPolicy)
×
2824
                }
×
2825

2826
                // set image pull secrets
2827
                if len(config.GPUDirectStorage.ImagePullSecrets) > 0 {
×
2828
                        addPullSecrets(&obj.Spec.Template.Spec, config.GPUDirectStorage.ImagePullSecrets)
×
2829
                }
×
2830

2831
                // set/append environment variables for GDS container
2832
                if len(config.GPUDirectStorage.Env) > 0 {
×
2833
                        for _, env := range config.GPUDirectStorage.Env {
×
2834
                                setContainerEnv(gdsContainer, env.Name, env.Value)
×
2835
                        }
×
2836
                }
2837

UNCOV
2838
                if config.Driver.RepoConfig != nil && config.Driver.RepoConfig.ConfigMapName != "" {
×
2839
                        // note: transformDriverContainer() will have already created a Volume backed by the ConfigMap.
×
2840
                        // Only add a VolumeMount for nvidia-fs-ctr.
×
2841
                        destinationDir, err := getRepoConfigPath()
×
2842
                        if err != nil {
×
2843
                                return fmt.Errorf("ERROR: failed to get destination directory for custom repo config: %w", err)
×
2844
                        }
×
2845
                        volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.RepoConfig.ConfigMapName, destinationDir)
×
2846
                        if err != nil {
×
2847
                                return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for custom package repo config: %w", err)
×
UNCOV
2848
                        }
×
UNCOV
2849
                        gdsContainer.VolumeMounts = append(gdsContainer.VolumeMounts, volumeMounts...)
×
2850
                }
2851

2852
                // set any custom ssl key/certificate configuration provided
UNCOV
2853
                if config.Driver.CertConfig != nil && config.Driver.CertConfig.Name != "" {
×
UNCOV
2854
                        destinationDir, err := getCertConfigPath()
×
UNCOV
2855
                        if err != nil {
×
UNCOV
2856
                                return fmt.Errorf("ERROR: failed to get destination directory for ssl key/cert config: %w", err)
×
UNCOV
2857
                        }
×
UNCOV
2858
                        volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.CertConfig.Name, destinationDir)
×
UNCOV
2859
                        if err != nil {
×
UNCOV
2860
                                return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for custom certs: %w", err)
×
UNCOV
2861
                        }
×
UNCOV
2862
                        gdsContainer.VolumeMounts = append(gdsContainer.VolumeMounts, volumeMounts...)
×
2863
                }
2864

2865
                // transform the nvidia-fs-ctr to use the openshift driver toolkit
2866
                // notify openshift driver toolkit container GDS is enabled
2867
                err = transformOpenShiftDriverToolkitContainer(obj, config, n, "nvidia-fs-ctr")
×
2868
                if err != nil {
×
UNCOV
2869
                        return fmt.Errorf("ERROR: failed to transform the Driver Toolkit Container: %s", err)
×
2870
                }
×
2871
        }
2872
        return nil
×
2873
}
2874

2875
func transformGDRCopyContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
2876
        for i, container := range obj.Spec.Template.Spec.Containers {
2✔
2877
                // skip if not nvidia-gdrcopy
1✔
2878
                if !strings.HasPrefix(container.Name, "nvidia-gdrcopy") {
2✔
2879
                        continue
1✔
2880
                }
2881
                if config.GDRCopy == nil || !config.GDRCopy.IsEnabled() {
2✔
2882
                        n.logger.Info("GDRCopy is disabled")
1✔
2883
                        // remove nvidia-gdrcopy sidecar container from driver Daemonset if gdrcopy is not enabled
1✔
2884
                        obj.Spec.Template.Spec.Containers = append(obj.Spec.Template.Spec.Containers[:i], obj.Spec.Template.Spec.Containers[i+1:]...)
1✔
2885
                        return nil
1✔
2886
                }
1✔
2887
                if config.Driver.UsePrecompiledDrivers() {
×
UNCOV
2888
                        return fmt.Errorf("GDRCopy is not supported along with pre-compiled NVIDIA drivers")
×
UNCOV
2889
                }
×
2890

2891
                gdrcopyContainer := &obj.Spec.Template.Spec.Containers[i]
×
2892

×
2893
                // update nvidia-gdrcopy image and pull policy
×
UNCOV
2894
                gdrcopyImage, err := resolveDriverTag(n, config.GDRCopy)
×
UNCOV
2895
                if err != nil {
×
2896
                        return err
×
2897
                }
×
2898
                if gdrcopyImage != "" {
×
2899
                        gdrcopyContainer.Image = gdrcopyImage
×
2900
                }
×
2901
                if config.GDRCopy.ImagePullPolicy != "" {
×
2902
                        gdrcopyContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.GDRCopy.ImagePullPolicy)
×
2903
                }
×
2904

2905
                // set image pull secrets
2906
                if len(config.GDRCopy.ImagePullSecrets) > 0 {
×
2907
                        addPullSecrets(&obj.Spec.Template.Spec, config.GDRCopy.ImagePullSecrets)
×
UNCOV
2908
                }
×
2909

2910
                // set/append environment variables for gdrcopy container
2911
                if len(config.GDRCopy.Env) > 0 {
×
2912
                        for _, env := range config.GDRCopy.Env {
×
2913
                                setContainerEnv(gdrcopyContainer, env.Name, env.Value)
×
2914
                        }
×
2915
                }
2916

2917
                if config.Driver.RepoConfig != nil && config.Driver.RepoConfig.ConfigMapName != "" {
×
2918
                        // note: transformDriverContainer() will have already created a Volume backed by the ConfigMap.
×
2919
                        // Only add a VolumeMount for nvidia-gdrcopy-ctr.
×
2920
                        destinationDir, err := getRepoConfigPath()
×
UNCOV
2921
                        if err != nil {
×
UNCOV
2922
                                return fmt.Errorf("ERROR: failed to get destination directory for custom repo config: %w", err)
×
UNCOV
2923
                        }
×
UNCOV
2924
                        volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.RepoConfig.ConfigMapName, destinationDir)
×
2925
                        if err != nil {
×
2926
                                return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for custom package repo config: %w", err)
×
2927
                        }
×
2928
                        gdrcopyContainer.VolumeMounts = append(gdrcopyContainer.VolumeMounts, volumeMounts...)
×
2929
                }
2930

2931
                // set any custom ssl key/certificate configuration provided
UNCOV
2932
                if config.Driver.CertConfig != nil && config.Driver.CertConfig.Name != "" {
×
UNCOV
2933
                        destinationDir, err := getCertConfigPath()
×
UNCOV
2934
                        if err != nil {
×
UNCOV
2935
                                return fmt.Errorf("ERROR: failed to get destination directory for ssl key/cert config: %w", err)
×
UNCOV
2936
                        }
×
UNCOV
2937
                        volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.CertConfig.Name, destinationDir)
×
UNCOV
2938
                        if err != nil {
×
UNCOV
2939
                                return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for custom certs: %w", err)
×
UNCOV
2940
                        }
×
UNCOV
2941
                        gdrcopyContainer.VolumeMounts = append(gdrcopyContainer.VolumeMounts, volumeMounts...)
×
2942
                }
2943

2944
                // transform the nvidia-gdrcopy-ctr to use the openshift driver toolkit
2945
                // notify openshift driver toolkit container that gdrcopy is enabled
2946
                err = transformOpenShiftDriverToolkitContainer(obj, config, n, "nvidia-gdrcopy-ctr")
×
2947
                if err != nil {
×
UNCOV
2948
                        return fmt.Errorf("ERROR: failed to transform the Driver Toolkit Container: %w", err)
×
2949
                }
×
2950
        }
2951
        return nil
×
2952
}
2953

2954
// kernelArchRegex matches the architecture suffixes embedded in kernel version
// strings ("x86_64", "aarch64", and their "_64k" large-page variants).
// Compiled once at package init instead of on every call.
var kernelArchRegex = regexp.MustCompile("x86_64(?:_64k)?|aarch64(?:_64k)?")

// getSanitizedKernelVersion returns kernelVersion with following changes
// 1. Remove arch suffix (as we use multi-arch images) and
// 2. ensure to meet k8s constraints for metadata.name, i.e it
// must consist of lower case alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character
func getSanitizedKernelVersion(kernelVersion string) string {
	// remove arch strings, "_" and any trailing "." from the kernel version
	sanitized := kernelArchRegex.ReplaceAllString(kernelVersion, "")
	sanitized = strings.ReplaceAll(sanitized, "_", ".")
	sanitized = strings.TrimSuffix(sanitized, ".")
	return strings.ToLower(sanitized)
}
2964

2965
func transformPrecompiledDriverDaemonset(obj *appsv1.DaemonSet, n ClusterPolicyController) (err error) {
1✔
2966
        sanitizedVersion := getSanitizedKernelVersion(n.currentKernelVersion)
1✔
2967
        // prepare the DaemonSet to be kernel-version specific
1✔
2968
        obj.Name += "-" + sanitizedVersion + "-" + n.kernelVersionMap[n.currentKernelVersion]
1✔
2969

1✔
2970
        // add unique labels for each kernel-version specific Daemonset
1✔
2971
        obj.Labels[precompiledIdentificationLabelKey] = precompiledIdentificationLabelValue
1✔
2972
        obj.Spec.Template.Labels[precompiledIdentificationLabelKey] = precompiledIdentificationLabelValue
1✔
2973

1✔
2974
        // append kernel-version specific node-selector
1✔
2975
        obj.Spec.Template.Spec.NodeSelector[nfdKernelLabelKey] = n.currentKernelVersion
1✔
2976
        return nil
1✔
2977
}
1✔
2978

2979
// transformOpenShiftDriverToolkitContainer configures the
// openshift-driver-toolkit-ctr sidecar in the given driver DaemonSet.
//
// When the OpenShift DriverToolkit is disabled, the sidecar container is
// simply removed. When enabled, the DaemonSet is made RHCOS-version
// specific (name, labels, node selector), the sidecar image is resolved
// from the Driver-Toolkit imagestream (with a fallback when the RHCOS tag
// is missing), the main container identified by mainContainerName is
// rewired to start via the DriverToolkit entrypoint, and a shared EmptyDir
// volume is mounted so the two containers can exchange build artifacts.
//
// The mutation order matters: containers are looked up before the
// DaemonSet is renamed, and the shared volume is only appended after
// checking for an existing volume of the same name.
func transformOpenShiftDriverToolkitContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController, mainContainerName string) error {
	var err error

	// getContainer returns a pointer into obj's container slice for the
	// named container, or — when remove is true — deletes it in place.
	// NOTE: the returned pointer aliases the slice; it stays valid here
	// because no containers are appended afterwards.
	getContainer := func(name string, remove bool) (*corev1.Container, error) {
		for i, container := range obj.Spec.Template.Spec.Containers {
			if container.Name != name {
				continue
			}
			if !remove {
				return &obj.Spec.Template.Spec.Containers[i], nil
			}

			obj.Spec.Template.Spec.Containers = append(obj.Spec.Template.Spec.Containers[:i],
				obj.Spec.Template.Spec.Containers[i+1:]...)
			return nil, nil
		}

		// if a container is not found, then it must have been removed already, return success
		if remove {
			return nil, nil
		}

		return nil, fmt.Errorf("could not find the '%s' container", name)
	}

	if !n.ocpDriverToolkit.enabled {
		if n.ocpDriverToolkit.requested {
			n.logger.Info("OpenShift DriverToolkit was requested but could not be enabled (dependencies missing)")
		}

		/* remove OpenShift Driver Toolkit side-car container from the Driver DaemonSet */
		_, err = getContainer("openshift-driver-toolkit-ctr", true)
		return err
	}

	/* find the main container and driver-toolkit sidecar container */
	var mainContainer, driverToolkitContainer *corev1.Container
	if mainContainer, err = getContainer(mainContainerName, false); err != nil {
		return err
	}

	if driverToolkitContainer, err = getContainer("openshift-driver-toolkit-ctr", false); err != nil {
		return err
	}

	/* prepare the DaemonSet to be RHCOS-version specific */
	rhcosVersion := n.ocpDriverToolkit.currentRhcosVersion

	// guard against appending the suffix twice on repeated transforms
	if !strings.Contains(obj.Name, rhcosVersion) {
		obj.Name += "-" + rhcosVersion
	}
	obj.Labels["app"] = obj.Name
	obj.Spec.Selector.MatchLabels["app"] = obj.Name
	obj.Spec.Template.Labels["app"] = obj.Name

	obj.Labels[ocpDriverToolkitVersionLabel] = rhcosVersion
	obj.Spec.Template.Spec.NodeSelector[nfdOSTreeVersionLabelKey] = rhcosVersion

	/* prepare the DaemonSet to be searchable */
	obj.Labels[ocpDriverToolkitIdentificationLabel] = ocpDriverToolkitIdentificationValue
	obj.Spec.Template.Labels[ocpDriverToolkitIdentificationLabel] = ocpDriverToolkitIdentificationValue

	/* prepare the DriverToolkit container */
	setContainerEnv(driverToolkitContainer, "RHCOS_VERSION", rhcosVersion)

	// propagate feature flags so the toolkit builds the optional modules
	if config.GPUDirectStorage != nil && config.GPUDirectStorage.IsEnabled() {
		setContainerEnv(driverToolkitContainer, "GDS_ENABLED", "true")
		n.logger.V(2).Info("transformOpenShiftDriverToolkitContainer", "GDS_ENABLED", config.GPUDirectStorage.IsEnabled())
	}

	if config.GDRCopy != nil && config.GDRCopy.IsEnabled() {
		setContainerEnv(driverToolkitContainer, "GDRCOPY_ENABLED", "true")
		n.logger.V(2).Info("transformOpenShiftDriverToolkitContainer", "GDRCOPY_ENABLED", "true")
	}

	image := n.ocpDriverToolkit.rhcosDriverToolkitImages[n.ocpDriverToolkit.currentRhcosVersion]
	if image != "" {
		driverToolkitContainer.Image = image
		n.logger.Info("DriverToolkit", "image", driverToolkitContainer.Image)
	} else {
		/* RHCOS tag missing in the Driver-Toolkit imagestream, setup fallback */
		// mark the DaemonSet so the missing tag is discoverable via labels,
		// reuse the main container image for the sidecar, and tell both
		// containers to operate in fallback mode via env vars
		obj.Labels["openshift.driver-toolkit.rhcos-image-missing"] = "true"
		obj.Spec.Template.Labels["openshift.driver-toolkit.rhcos-image-missing"] = "true"

		driverToolkitContainer.Image = mainContainer.Image
		setContainerEnv(mainContainer, "RHCOS_IMAGE_MISSING", "true")
		setContainerEnv(mainContainer, "RHCOS_VERSION", rhcosVersion)
		setContainerEnv(driverToolkitContainer, "RHCOS_IMAGE_MISSING", "true")

		n.logger.Info("WARNING: DriverToolkit image tag missing. Version-specific fallback mode enabled.", "rhcosVersion", rhcosVersion)
	}

	/* prepare the main container to start from the DriverToolkit entrypoint */
	// the entrypoint binary is the same for all containers; only the
	// sub-command differs per container type
	switch mainContainerName {
	case "nvidia-fs-ctr":
		mainContainer.Command = []string{"ocp_dtk_entrypoint"}
		mainContainer.Args = []string{"nv-fs-ctr-run-with-dtk"}
	case "nvidia-gdrcopy-ctr":
		mainContainer.Command = []string{"ocp_dtk_entrypoint"}
		mainContainer.Args = []string{"gdrcopy-ctr-run-with-dtk"}
	default:
		mainContainer.Command = []string{"ocp_dtk_entrypoint"}
		mainContainer.Args = []string{"nv-ctr-run-with-dtk"}
	}

	/* prepare the shared volumes */
	// shared directory
	volSharedDirName, volSharedDirPath := "shared-nvidia-driver-toolkit", "/mnt/shared-nvidia-driver-toolkit"

	volMountSharedDir := corev1.VolumeMount{Name: volSharedDirName, MountPath: volSharedDirPath}
	mainContainer.VolumeMounts = append(mainContainer.VolumeMounts, volMountSharedDir)

	volSharedDir := corev1.Volume{
		Name: volSharedDirName,
		VolumeSource: corev1.VolumeSource{
			EmptyDir: &corev1.EmptyDirVolumeSource{},
		},
	}

	// Check if the volume already exists, if not add it
	// NOTE(review): when the volume already exists we return before adding
	// it, but the main container's VolumeMount was already appended above —
	// presumably acceptable on a fresh manifest; verify for re-transforms.
	for i := range obj.Spec.Template.Spec.Volumes {
		if obj.Spec.Template.Spec.Volumes[i].Name == volSharedDirName {
			// already exists, avoid duplicated volume
			return nil
		}
	}
	obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, volSharedDir)
	return nil
}
3108

3109
// resolveDriverTag resolves image tag based on the OS of the worker node
3110
func resolveDriverTag(n ClusterPolicyController, driverSpec interface{}) (string, error) {
1✔
3111
        // obtain os version
1✔
3112
        kvers, osTag, _ := kernelFullVersion(n)
1✔
3113
        if kvers == "" {
1✔
3114
                return "", fmt.Errorf("ERROR: Could not find kernel full version: ('%s', '%s')", kvers, osTag)
×
3115
        }
×
3116

3117
        // obtain image path
3118
        var image string
1✔
3119
        var err error
1✔
3120
        switch v := driverSpec.(type) {
1✔
3121
        case *gpuv1.DriverSpec:
1✔
3122
                spec := driverSpec.(*gpuv1.DriverSpec)
1✔
3123
                // check if this is pre-compiled driver deployment.
1✔
3124
                if spec.UsePrecompiledDrivers() {
2✔
3125
                        if spec.Repository == "" && spec.Version == "" {
1✔
3126
                                if spec.Image != "" {
×
3127
                                        // this is useful for tools like kbld(carvel) which will just specify driver.image param as path:version
×
UNCOV
3128
                                        image = spec.Image + "-" + n.currentKernelVersion
×
UNCOV
3129
                                } else {
×
3130
                                        return "", fmt.Errorf("unable to resolve driver image path for pre-compiled drivers, driver.repository, driver.image and driver.version have to be specified in the ClusterPolicy")
×
3131
                                }
×
3132
                        } else {
1✔
3133
                                // use per kernel version tag
1✔
3134
                                image = spec.Repository + "/" + spec.Image + ":" + spec.Version + "-" + n.currentKernelVersion
1✔
3135
                        }
1✔
3136
                } else {
1✔
3137
                        image, err = gpuv1.ImagePath(spec)
1✔
3138
                        if err != nil {
1✔
3139
                                return "", err
×
UNCOV
3140
                        }
×
3141
                }
UNCOV
3142
        case *gpuv1.GPUDirectStorageSpec:
×
UNCOV
3143
                spec := driverSpec.(*gpuv1.GPUDirectStorageSpec)
×
3144
                image, err = gpuv1.ImagePath(spec)
×
3145
                if err != nil {
×
3146
                        return "", err
×
3147
                }
×
3148
        case *gpuv1.VGPUManagerSpec:
1✔
3149
                spec := driverSpec.(*gpuv1.VGPUManagerSpec)
1✔
3150
                image, err = gpuv1.ImagePath(spec)
1✔
3151
                if err != nil {
1✔
3152
                        return "", err
×
3153
                }
×
3154
        case *gpuv1.GDRCopySpec:
×
3155
                spec := driverSpec.(*gpuv1.GDRCopySpec)
×
3156
                image, err = gpuv1.ImagePath(spec)
×
3157
                if err != nil {
×
3158
                        return "", err
×
3159
                }
×
3160
        default:
×
3161
                return "", fmt.Errorf("invalid type to construct image path: %v", v)
×
3162
        }
3163

3164
        // if image digest is specified, use it directly
3165
        if !strings.Contains(image, "sha256:") {
2✔
3166
                // append os-tag to the provided driver version
1✔
3167
                image = fmt.Sprintf("%s-%s", image, osTag)
1✔
3168
        }
1✔
3169
        return image, nil
1✔
3170
}
3171

3172
// getRepoConfigPath returns the standard OS specific path for repository configuration files
3173
func getRepoConfigPath() (string, error) {
×
UNCOV
3174
        release, err := parseOSRelease()
×
UNCOV
3175
        if err != nil {
×
UNCOV
3176
                return "", err
×
UNCOV
3177
        }
×
3178

UNCOV
3179
        os := release["ID"]
×
UNCOV
3180
        if path, ok := RepoConfigPathMap[os]; ok {
×
UNCOV
3181
                return path, nil
×
UNCOV
3182
        }
×
UNCOV
3183
        return "", fmt.Errorf("distribution not supported")
×
3184
}
3185

3186
// getCertConfigPath returns the standard OS specific path for ssl keys/certificates
3187
func getCertConfigPath() (string, error) {
×
3188
        release, err := parseOSRelease()
×
3189
        if err != nil {
×
UNCOV
3190
                return "", err
×
UNCOV
3191
        }
×
3192

UNCOV
3193
        os := release["ID"]
×
UNCOV
3194
        if path, ok := CertConfigPathMap[os]; ok {
×
UNCOV
3195
                return path, nil
×
UNCOV
3196
        }
×
3197
        return "", fmt.Errorf("distribution not supported")
×
3198
}
3199

3200
// getSubscriptionPathsToVolumeSources returns the MountPathToVolumeSource map containing all
3201
// OS-specific subscription/entitlement paths that need to be mounted in the container.
3202
func getSubscriptionPathsToVolumeSources() (MountPathToVolumeSource, error) {
×
3203
        release, err := parseOSRelease()
×
3204
        if err != nil {
×
3205
                return nil, err
×
UNCOV
3206
        }
×
3207

UNCOV
3208
        os := release["ID"]
×
UNCOV
3209
        if pathToVolumeSource, ok := SubscriptionPathMap[os]; ok {
×
3210
                return pathToVolumeSource, nil
×
3211
        }
×
3212
        return nil, fmt.Errorf("distribution not supported")
×
3213
}
3214

3215
// createConfigMapVolumeMounts creates a VolumeMount for each key
3216
// in the ConfigMap. Use subPath to ensure original contents
3217
// at destinationDir are not overwritten.
3218
func createConfigMapVolumeMounts(n ClusterPolicyController, configMapName string, destinationDir string) ([]corev1.VolumeMount, []corev1.KeyToPath, error) {
×
3219
        ctx := n.ctx
×
UNCOV
3220
        // get the ConfigMap
×
UNCOV
3221
        cm := &corev1.ConfigMap{}
×
UNCOV
3222
        opts := client.ObjectKey{Namespace: n.operatorNamespace, Name: configMapName}
×
UNCOV
3223
        err := n.client.Get(ctx, opts, cm)
×
UNCOV
3224
        if err != nil {
×
UNCOV
3225
                return nil, nil, fmt.Errorf("ERROR: could not get ConfigMap %s from client: %v", configMapName, err)
×
UNCOV
3226
        }
×
3227

3228
        // create one volume mount per file in the ConfigMap and use subPath
UNCOV
3229
        var filenames []string
×
UNCOV
3230
        for filename := range cm.Data {
×
3231
                filenames = append(filenames, filename)
×
3232
        }
×
3233
        // sort so volume mounts are added to spec in deterministic order
3234
        sort.Strings(filenames)
×
3235
        var itemsToInclude []corev1.KeyToPath
×
UNCOV
3236
        var volumeMounts []corev1.VolumeMount
×
3237
        for _, filename := range filenames {
×
3238
                volumeMounts = append(volumeMounts,
×
3239
                        corev1.VolumeMount{Name: configMapName, ReadOnly: true, MountPath: filepath.Join(destinationDir, filename), SubPath: filename})
×
3240
                itemsToInclude = append(itemsToInclude, corev1.KeyToPath{
×
3241
                        Key:  filename,
×
UNCOV
3242
                        Path: filename,
×
UNCOV
3243
                })
×
UNCOV
3244
        }
×
3245
        return volumeMounts, itemsToInclude, nil
×
3246
}
3247

3248
func createConfigMapVolume(configMapName string, itemsToInclude []corev1.KeyToPath) corev1.Volume {
1✔
3249
        volumeSource := corev1.VolumeSource{
1✔
3250
                ConfigMap: &corev1.ConfigMapVolumeSource{
1✔
3251
                        LocalObjectReference: corev1.LocalObjectReference{
1✔
3252
                                Name: configMapName,
1✔
3253
                        },
1✔
3254
                        Items: itemsToInclude,
1✔
3255
                },
1✔
3256
        }
1✔
3257
        return corev1.Volume{Name: configMapName, VolumeSource: volumeSource}
1✔
3258
}
1✔
3259

3260
func createEmptyDirVolume(volumeName string) corev1.Volume {
1✔
3261
        return corev1.Volume{
1✔
3262
                Name: volumeName,
1✔
3263
                VolumeSource: corev1.VolumeSource{
1✔
3264
                        EmptyDir: &corev1.EmptyDirVolumeSource{},
1✔
3265
                },
1✔
3266
        }
1✔
3267
}
1✔
3268

3269
func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
3270
        driverIndex := 0
1✔
3271
        driverCtrFound := false
1✔
3272

1✔
3273
        podSpec := &obj.Spec.Template.Spec
1✔
3274
        for i, container := range podSpec.Containers {
2✔
3275
                // check if this is the main nvidia-driver container
1✔
3276
                if container.Name == "nvidia-driver-ctr" {
2✔
3277
                        driverIndex = i
1✔
3278
                        driverCtrFound = true
1✔
3279
                        break
1✔
3280
                }
3281
        }
3282

3283
        if !driverCtrFound {
1✔
3284
                return fmt.Errorf("driver container (nvidia-driver-ctr) is missing from the driver daemonset manifest")
×
UNCOV
3285
        }
×
3286

3287
        driverContainer := &podSpec.Containers[driverIndex]
1✔
3288

1✔
3289
        image, err := resolveDriverTag(n, &config.Driver)
1✔
3290
        if err != nil {
1✔
UNCOV
3291
                return err
×
3292
        }
×
3293
        if image != "" {
2✔
3294
                driverContainer.Image = image
1✔
3295
        }
1✔
3296

3297
        // update image pull policy
3298
        driverContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.Driver.ImagePullPolicy)
1✔
3299

1✔
3300
        // set image pull secrets
1✔
3301
        if len(config.Driver.ImagePullSecrets) > 0 {
2✔
3302
                addPullSecrets(&obj.Spec.Template.Spec, config.Driver.ImagePullSecrets)
1✔
3303
        }
1✔
3304
        // set resource limits
3305
        if config.Driver.Resources != nil {
1✔
UNCOV
3306
                driverContainer.Resources.Requests = config.Driver.Resources.Requests
×
UNCOV
3307
                driverContainer.Resources.Limits = config.Driver.Resources.Limits
×
UNCOV
3308
        }
×
3309
        // set arguments if specified for driver container
3310
        if len(config.Driver.Args) > 0 {
1✔
UNCOV
3311
                driverContainer.Args = config.Driver.Args
×
UNCOV
3312
        }
×
3313

3314
        if len(config.Driver.KernelModuleType) > 0 {
1✔
UNCOV
3315
                setContainerEnv(driverContainer, KernelModuleTypeEnvName, config.Driver.KernelModuleType)
×
UNCOV
3316
                // we set the "OPEN_KERNEL_MODULES_ENABLED" envar for backwards compatibility with older driver containers
×
UNCOV
3317
                if config.Driver.OpenKernelModulesEnabled() {
×
UNCOV
3318
                        setContainerEnv(driverContainer, OpenKernelModulesEnabledEnvName, "true")
×
UNCOV
3319
                }
×
3320
        }
3321

3322
        // set container probe timeouts
3323
        if config.Driver.StartupProbe != nil {
2✔
3324
                setContainerProbe(driverContainer, config.Driver.StartupProbe, Startup)
1✔
3325
        }
1✔
3326
        if config.Driver.LivenessProbe != nil {
1✔
UNCOV
3327
                setContainerProbe(driverContainer, config.Driver.LivenessProbe, Liveness)
×
UNCOV
3328
        }
×
3329
        if config.Driver.ReadinessProbe != nil {
1✔
UNCOV
3330
                setContainerProbe(driverContainer, config.Driver.ReadinessProbe, Readiness)
×
UNCOV
3331
        }
×
3332

3333
        if config.Driver.GPUDirectRDMA != nil && config.Driver.GPUDirectRDMA.IsEnabled() {
1✔
UNCOV
3334
                // set env indicating nvidia-peermem is enabled to compile module with required ib_* interfaces
×
UNCOV
3335
                setContainerEnv(driverContainer, GPUDirectRDMAEnabledEnvName, "true")
×
UNCOV
3336
                // check if MOFED drives are directly installed on host and update source path accordingly
×
UNCOV
3337
                // to build nvidia-peermem module
×
UNCOV
3338
                if config.Driver.GPUDirectRDMA.UseHostMOFED != nil && *config.Driver.GPUDirectRDMA.UseHostMOFED {
×
UNCOV
3339
                        // mount /usr/src/ofa_kernel path directly from host to build using MOFED drivers installed on host
×
UNCOV
3340
                        for index, volume := range podSpec.Volumes {
×
UNCOV
3341
                                if volume.Name == "mlnx-ofed-usr-src" {
×
3342
                                        podSpec.Volumes[index].HostPath.Path = "/usr/src"
×
3343
                                }
×
3344
                        }
3345
                        // set env indicating host-mofed is enabled
UNCOV
3346
                        setContainerEnv(driverContainer, UseHostMOFEDEnvName, "true")
×
3347
                }
3348
        }
3349

3350
        // set any licensing configuration required
3351
        if config.Driver.LicensingConfig != nil && config.Driver.LicensingConfig.ConfigMapName != "" {
1✔
UNCOV
3352
                licensingConfigVolMount := corev1.VolumeMount{Name: "licensing-config", ReadOnly: true, MountPath: VGPULicensingConfigMountPath, SubPath: VGPULicensingFileName}
×
UNCOV
3353
                driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, licensingConfigVolMount)
×
UNCOV
3354

×
UNCOV
3355
                // gridd.conf always mounted
×
UNCOV
3356
                licenseItemsToInclude := []corev1.KeyToPath{
×
UNCOV
3357
                        {
×
UNCOV
3358
                                Key:  VGPULicensingFileName,
×
UNCOV
3359
                                Path: VGPULicensingFileName,
×
UNCOV
3360
                        },
×
UNCOV
3361
                }
×
UNCOV
3362
                // client config token only mounted when NLS is enabled
×
UNCOV
3363
                if config.Driver.LicensingConfig.IsNLSEnabled() {
×
3364
                        licenseItemsToInclude = append(licenseItemsToInclude, corev1.KeyToPath{
×
3365
                                Key:  NLSClientTokenFileName,
×
3366
                                Path: NLSClientTokenFileName,
×
UNCOV
3367
                        })
×
UNCOV
3368
                        nlsTokenVolMount := corev1.VolumeMount{Name: "licensing-config", ReadOnly: true, MountPath: NLSClientTokenMountPath, SubPath: NLSClientTokenFileName}
×
3369
                        driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, nlsTokenVolMount)
×
3370
                }
×
3371

UNCOV
3372
                licensingConfigVolumeSource := corev1.VolumeSource{
×
3373
                        ConfigMap: &corev1.ConfigMapVolumeSource{
×
3374
                                LocalObjectReference: corev1.LocalObjectReference{
×
3375
                                        Name: config.Driver.LicensingConfig.ConfigMapName,
×
3376
                                },
×
3377
                                Items: licenseItemsToInclude,
×
UNCOV
3378
                        },
×
UNCOV
3379
                }
×
UNCOV
3380
                licensingConfigVol := corev1.Volume{Name: "licensing-config", VolumeSource: licensingConfigVolumeSource}
×
UNCOV
3381
                podSpec.Volumes = append(podSpec.Volumes, licensingConfigVol)
×
3382
        }
3383

3384
        // set virtual topology daemon configuration if specified for vGPU driver
3385
        if config.Driver.VirtualTopology != nil && config.Driver.VirtualTopology.Config != "" {
1✔
3386
                topologyConfigVolMount := corev1.VolumeMount{Name: "topology-config", ReadOnly: true, MountPath: VGPUTopologyConfigMountPath, SubPath: VGPUTopologyConfigFileName}
×
UNCOV
3387
                driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, topologyConfigVolMount)
×
3388

×
3389
                topologyConfigVolumeSource := corev1.VolumeSource{
×
UNCOV
3390
                        ConfigMap: &corev1.ConfigMapVolumeSource{
×
UNCOV
3391
                                LocalObjectReference: corev1.LocalObjectReference{
×
3392
                                        Name: config.Driver.VirtualTopology.Config,
×
3393
                                },
×
3394
                                Items: []corev1.KeyToPath{
×
3395
                                        {
×
3396
                                                Key:  VGPUTopologyConfigFileName,
×
3397
                                                Path: VGPUTopologyConfigFileName,
×
3398
                                        },
×
3399
                                },
×
3400
                        },
×
3401
                }
×
UNCOV
3402
                topologyConfigVol := corev1.Volume{Name: "topology-config", VolumeSource: topologyConfigVolumeSource}
×
UNCOV
3403
                podSpec.Volumes = append(podSpec.Volumes, topologyConfigVol)
×
3404
        }
×
3405

3406
        // mount any custom kernel module configuration parameters at /drivers
3407
        if config.Driver.KernelModuleConfig != nil && config.Driver.KernelModuleConfig.Name != "" {
1✔
UNCOV
3408
                destinationDir := "/drivers"
×
UNCOV
3409
                volumeMounts, itemsToInclude, err := createConfigMapVolumeMounts(n, config.Driver.KernelModuleConfig.Name, destinationDir)
×
3410
                if err != nil {
×
3411
                        return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for kernel module configuration: %v", err)
×
3412
                }
×
3413
                driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, volumeMounts...)
×
3414
                podSpec.Volumes = append(podSpec.Volumes, createConfigMapVolume(config.Driver.KernelModuleConfig.Name, itemsToInclude))
×
3415
        }
3416

3417
        if len(config.Driver.Env) > 0 {
1✔
3418
                for _, env := range config.Driver.Env {
×
3419
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
×
3420
                }
×
3421
        }
3422

3423
        // no further repo configuration required when using pre-compiled drivers, return here.
3424
        if config.Driver.UsePrecompiledDrivers() {
2✔
3425
                return nil
1✔
3426
        }
1✔
3427

3428
        // set any custom repo configuration provided when using runfile based driver installation
3429
        if config.Driver.RepoConfig != nil && config.Driver.RepoConfig.ConfigMapName != "" {
1✔
3430
                destinationDir, err := getRepoConfigPath()
×
3431
                if err != nil {
×
3432
                        return fmt.Errorf("ERROR: failed to get destination directory for custom repo config: %v", err)
×
3433
                }
×
3434
                volumeMounts, itemsToInclude, err := createConfigMapVolumeMounts(n, config.Driver.RepoConfig.ConfigMapName, destinationDir)
×
3435
                if err != nil {
×
3436
                        return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for custom repo config: %v", err)
×
3437
                }
×
3438
                driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, volumeMounts...)
×
3439
                podSpec.Volumes = append(podSpec.Volumes, createConfigMapVolume(config.Driver.RepoConfig.ConfigMapName, itemsToInclude))
×
3440
        }
3441

3442
        // set any custom ssl key/certificate configuration provided
3443
        if config.Driver.CertConfig != nil && config.Driver.CertConfig.Name != "" {
1✔
3444
                destinationDir, err := getCertConfigPath()
×
3445
                if err != nil {
×
3446
                        return fmt.Errorf("ERROR: failed to get destination directory for custom repo config: %v", err)
×
3447
                }
×
3448
                volumeMounts, itemsToInclude, err := createConfigMapVolumeMounts(n, config.Driver.CertConfig.Name, destinationDir)
×
3449
                if err != nil {
×
3450
                        return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for custom certs: %v", err)
×
3451
                }
×
3452
                driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, volumeMounts...)
×
3453
                podSpec.Volumes = append(podSpec.Volumes, createConfigMapVolume(config.Driver.CertConfig.Name, itemsToInclude))
×
3454
        }
3455

3456
        release, err := parseOSRelease()
1✔
3457
        if err != nil {
1✔
3458
                return fmt.Errorf("ERROR: failed to get os-release: %s", err)
×
3459
        }
×
3460

3461
        // set up subscription entitlements for RHEL(using K8s with a non-CRIO runtime) and SLES
3462
        if (release["ID"] == "rhel" && n.openshift == "" && n.runtime != gpuv1.CRIO) || release["ID"] == "sles" || release["ID"] == "sl-micro" {
1✔
UNCOV
3463
                n.logger.Info("Mounting subscriptions into the driver container", "OS", release["ID"])
×
UNCOV
3464
                pathToVolumeSource, err := getSubscriptionPathsToVolumeSources()
×
UNCOV
3465
                if err != nil {
×
3466
                        return fmt.Errorf("ERROR: failed to get path items for subscription entitlements: %v", err)
×
3467
                }
×
3468

3469
                // sort host path volumes to ensure ordering is preserved when adding to pod spec
3470
                mountPaths := make([]string, 0, len(pathToVolumeSource))
×
3471
                for k := range pathToVolumeSource {
×
3472
                        mountPaths = append(mountPaths, k)
×
UNCOV
3473
                }
×
UNCOV
3474
                sort.Strings(mountPaths)
×
UNCOV
3475

×
3476
                for num, mountPath := range mountPaths {
×
3477
                        volMountSubscriptionName := fmt.Sprintf("subscription-config-%d", num)
×
3478

×
UNCOV
3479
                        volMountSubscription := corev1.VolumeMount{
×
UNCOV
3480
                                Name:      volMountSubscriptionName,
×
UNCOV
3481
                                MountPath: mountPath,
×
UNCOV
3482
                                ReadOnly:  true,
×
UNCOV
3483
                        }
×
UNCOV
3484
                        driverContainer.VolumeMounts = append(driverContainer.VolumeMounts, volMountSubscription)
×
UNCOV
3485

×
UNCOV
3486
                        subscriptionVol := corev1.Volume{Name: volMountSubscriptionName, VolumeSource: pathToVolumeSource[mountPath]}
×
UNCOV
3487
                        podSpec.Volumes = append(podSpec.Volumes, subscriptionVol)
×
3488
                }
×
3489
        }
3490

3491
        // skip proxy and env settings if not ocp cluster
3492
        if _, ok := release["OPENSHIFT_VERSION"]; !ok {
2✔
3493
                return nil
1✔
3494
        }
1✔
3495

3496
        ocpVersion := corev1.EnvVar{Name: "OPENSHIFT_VERSION", Value: release["OPENSHIFT_VERSION"]}
×
3497
        driverContainer.Env = append(driverContainer.Env, ocpVersion)
×
UNCOV
3498

×
UNCOV
3499
        // Automatically apply proxy settings for OCP and inject custom CA if configured by user
×
UNCOV
3500
        // https://docs.openshift.com/container-platform/4.6/networking/configuring-a-custom-pki.html
×
UNCOV
3501
        err = applyOCPProxySpec(n, podSpec)
×
3502
        if err != nil {
×
3503
                return err
×
3504
        }
×
3505
        return nil
×
3506
}
3507

3508
func transformVGPUManagerContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1✔
3509
        var container *corev1.Container
1✔
3510
        for i, ctr := range obj.Spec.Template.Spec.Containers {
2✔
3511
                if ctr.Name == "nvidia-vgpu-manager-ctr" {
2✔
3512
                        container = &obj.Spec.Template.Spec.Containers[i]
1✔
3513
                        break
1✔
3514
                }
3515
        }
3516

3517
        if container == nil {
1✔
UNCOV
3518
                return fmt.Errorf("failed to find nvidia-vgpu-manager-ctr in spec")
×
UNCOV
3519
        }
×
3520

3521
        image, err := resolveDriverTag(n, &config.VGPUManager)
1✔
3522
        if err != nil {
1✔
3523
                return err
×
3524
        }
×
3525
        if image != "" {
2✔
3526
                container.Image = image
1✔
3527
        }
1✔
3528

3529
        // update image pull policy
3530
        container.ImagePullPolicy = gpuv1.ImagePullPolicy(config.VGPUManager.ImagePullPolicy)
1✔
3531

1✔
3532
        // set image pull secrets
1✔
3533
        if len(config.VGPUManager.ImagePullSecrets) > 0 {
2✔
3534
                addPullSecrets(&obj.Spec.Template.Spec, config.VGPUManager.ImagePullSecrets)
1✔
3535
        }
1✔
3536
        // set resource limits
3537
        if config.VGPUManager.Resources != nil {
1✔
3538
                container.Resources.Requests = config.VGPUManager.Resources.Requests
×
3539
                container.Resources.Limits = config.VGPUManager.Resources.Limits
×
3540
        }
×
3541
        // set arguments if specified for driver container
3542
        if len(config.VGPUManager.Args) > 0 {
1✔
3543
                container.Args = config.VGPUManager.Args
×
3544
        }
×
3545

3546
        release, err := parseOSRelease()
1✔
3547
        if err != nil {
1✔
UNCOV
3548
                return fmt.Errorf("ERROR: failed to get os-release: %s", err)
×
UNCOV
3549
        }
×
3550

3551
        // add env for OCP
3552
        if _, ok := release["OPENSHIFT_VERSION"]; ok {
1✔
UNCOV
3553
                setContainerEnv(container, "OPENSHIFT_VERSION", release["OPENSHIFT_VERSION"])
×
3554
        }
×
3555

3556
        if len(config.VGPUManager.Env) > 0 {
1✔
3557
                for _, env := range config.VGPUManager.Env {
×
3558
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
×
3559
                }
×
3560
        }
3561

3562
        return nil
1✔
3563
}
3564

3565
func applyUpdateStrategyConfig(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
1✔
3566
        switch config.Daemonsets.UpdateStrategy {
1✔
3567
        case "OnDelete":
1✔
3568
                obj.Spec.UpdateStrategy = appsv1.DaemonSetUpdateStrategy{Type: appsv1.OnDeleteDaemonSetStrategyType}
1✔
3569
        case "RollingUpdate":
1✔
3570
                fallthrough
1✔
3571
        default:
1✔
3572
                // update config for RollingUpdate strategy
1✔
3573
                if config.Daemonsets.RollingUpdate == nil || config.Daemonsets.RollingUpdate.MaxUnavailable == "" {
2✔
3574
                        return nil
1✔
3575
                }
1✔
3576
                if strings.HasPrefix(obj.Name, commonDriverDaemonsetName) {
2✔
3577
                        // disallow setting RollingUpdate strategy with the driver container
1✔
3578
                        return nil
1✔
3579
                }
1✔
3580
                var intOrString intstr.IntOrString
1✔
3581
                if strings.HasSuffix(config.Daemonsets.RollingUpdate.MaxUnavailable, "%") {
2✔
3582
                        intOrString = intstr.IntOrString{Type: intstr.String, StrVal: config.Daemonsets.RollingUpdate.MaxUnavailable}
1✔
3583
                } else {
2✔
3584
                        int64Val, err := strconv.ParseInt(config.Daemonsets.RollingUpdate.MaxUnavailable, 10, 32)
1✔
3585
                        if err != nil {
2✔
3586
                                return fmt.Errorf("failed to apply rolling update config: %s", err)
1✔
3587
                        }
1✔
3588
                        intOrString = intstr.IntOrString{Type: intstr.Int, IntVal: int32(int64Val)}
1✔
3589
                }
3590
                rollingUpdateSpec := appsv1.RollingUpdateDaemonSet{MaxUnavailable: &intOrString}
1✔
3591
                obj.Spec.UpdateStrategy = appsv1.DaemonSetUpdateStrategy{Type: appsv1.RollingUpdateDaemonSetStrategyType, RollingUpdate: &rollingUpdateSpec}
1✔
3592
        }
3593
        return nil
1✔
3594
}
3595

3596
func transformValidationInitContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) error {
1✔
3597
        for i, initContainer := range obj.Spec.Template.Spec.InitContainers {
2✔
3598
                // skip if not validation initContainer
1✔
3599
                if !strings.Contains(initContainer.Name, "validation") {
2✔
3600
                        continue
1✔
3601
                }
3602

3603
                // TODO: refactor the component-specific validation logic so that we are not duplicating TransformValidatorComponent()
3604
                // Pass env for driver-validation init container
3605
                if strings.HasPrefix(initContainer.Name, "driver") {
2✔
3606
                        if len(config.Validator.Driver.Env) > 0 {
2✔
3607
                                for _, env := range config.Validator.Driver.Env {
2✔
3608
                                        setContainerEnv(&(obj.Spec.Template.Spec.InitContainers[i]), env.Name, env.Value)
1✔
3609
                                }
1✔
3610
                        }
3611
                }
3612

3613
                // Pass env for toolkit-validation init container
3614
                if strings.HasPrefix(initContainer.Name, "toolkit") {
2✔
3615
                        if len(config.Validator.Toolkit.Env) > 0 {
2✔
3616
                                for _, env := range config.Validator.Toolkit.Env {
2✔
3617
                                        setContainerEnv(&(obj.Spec.Template.Spec.InitContainers[i]), env.Name, env.Value)
1✔
3618
                                }
1✔
3619
                        }
3620
                }
3621

3622
                // update validation image
3623
                image, err := gpuv1.ImagePath(&config.Validator)
1✔
3624
                if err != nil {
1✔
UNCOV
3625
                        return err
×
UNCOV
3626
                }
×
3627
                obj.Spec.Template.Spec.InitContainers[i].Image = image
1✔
3628
                // update validation image pull policy
1✔
3629
                if config.Validator.ImagePullPolicy != "" {
2✔
3630
                        obj.Spec.Template.Spec.InitContainers[i].ImagePullPolicy = gpuv1.ImagePullPolicy(config.Validator.ImagePullPolicy)
1✔
3631
                }
1✔
3632
                // update the security context for the validator container
3633
                transformValidatorSecurityContext(&obj.Spec.Template.Spec.InitContainers[i])
1✔
3634
        }
3635
        // add any pull secrets needed for validation image
3636
        if len(config.Validator.ImagePullSecrets) > 0 {
2✔
3637
                addPullSecrets(&obj.Spec.Template.Spec, config.Validator.ImagePullSecrets)
1✔
3638
        }
1✔
3639
        return nil
1✔
3640
}
3641

3642
func addPullSecrets(podSpec *corev1.PodSpec, secrets []string) {
1✔
3643
        for _, secret := range secrets {
2✔
3644
                if !containsSecret(podSpec.ImagePullSecrets, secret) {
2✔
3645
                        podSpec.ImagePullSecrets = append(podSpec.ImagePullSecrets, corev1.LocalObjectReference{Name: secret})
1✔
3646
                }
1✔
3647
        }
3648
}
3649

3650
func containsSecret(secrets []corev1.LocalObjectReference, secretName string) bool {
1✔
3651
        for _, s := range secrets {
2✔
3652
                if s.Name == secretName {
2✔
3653
                        return true
1✔
3654
                }
1✔
3655
        }
3656
        return false
1✔
3657
}
3658

UNCOV
3659
func isDeploymentReady(name string, n ClusterPolicyController) gpuv1.State {
×
UNCOV
3660
        opts := []client.ListOption{
×
UNCOV
3661
                client.MatchingLabels{"app": name},
×
UNCOV
3662
        }
×
UNCOV
3663
        n.logger.V(1).Info("Deployment", "LabelSelector", fmt.Sprintf("app=%s", name))
×
UNCOV
3664
        list := &appsv1.DeploymentList{}
×
UNCOV
3665
        err := n.client.List(n.ctx, list, opts...)
×
UNCOV
3666
        if err != nil {
×
UNCOV
3667
                n.logger.Info("Could not get DeploymentList", err)
×
UNCOV
3668
        }
×
UNCOV
3669
        n.logger.V(1).Info("Deployment", "NumberOfDeployment", len(list.Items))
×
UNCOV
3670
        if len(list.Items) == 0 {
×
UNCOV
3671
                return gpuv1.NotReady
×
UNCOV
3672
        }
×
3673

UNCOV
3674
        ds := list.Items[0]
×
UNCOV
3675
        n.logger.V(1).Info("Deployment", "NumberUnavailable", ds.Status.UnavailableReplicas)
×
UNCOV
3676

×
UNCOV
3677
        if ds.Status.UnavailableReplicas != 0 {
×
UNCOV
3678
                return gpuv1.NotReady
×
UNCOV
3679
        }
×
3680

UNCOV
3681
        return isPodReady(name, n, "Running")
×
3682
}
3683

3684
// isDaemonSetReady reports the readiness of the named DaemonSet in the
// operator namespace. It returns Ready when the DaemonSet has zero desired
// pods, or when no pods are unavailable and — for the "OnDelete" update
// strategy — every owned pod is Running, matches the DaemonSet's current
// controller-revision hash, and has all containers ready.
func isDaemonSetReady(name string, n ClusterPolicyController) gpuv1.State {
	ctx := n.ctx
	ds := &appsv1.DaemonSet{}
	n.logger.V(2).Info("checking daemonset for readiness", "name", name)
	err := n.client.Get(ctx, types.NamespacedName{Namespace: n.operatorNamespace, Name: name}, ds)
	if err != nil {
		// NOTE(review): a Get error is only logged; execution continues with a
		// zero-valued DaemonSet, so DesiredNumberScheduled==0 below yields
		// Ready — confirm this "not found means ready" behavior is intended.
		n.logger.Error(err, "could not get daemonset", "name", name)
	}

	// a DaemonSet that schedules no pods (e.g. no matching nodes) is Ready
	if ds.Status.DesiredNumberScheduled == 0 {
		n.logger.V(2).Info("Daemonset has desired pods of 0", "name", name)
		return gpuv1.Ready
	}

	if ds.Status.NumberUnavailable != 0 {
		n.logger.Info("daemonset not ready", "name", name)
		return gpuv1.NotReady
	}

	// if ds is running with "OnDelete" strategy, check if the revision matches for all pods;
	// for RollingUpdate, NumberUnavailable==0 above is sufficient
	if ds.Spec.UpdateStrategy.Type != appsv1.OnDeleteDaemonSetStrategyType {
		return gpuv1.Ready
	}

	// list candidate pods by the pod template's labels; ownership is verified below
	opts := []client.ListOption{client.MatchingLabels(ds.Spec.Template.Labels)}

	n.logger.V(2).Info("Pod", "LabelSelector", fmt.Sprintf("app=%s", name))
	list := &corev1.PodList{}
	err = n.client.List(ctx, list, opts...)
	if err != nil {
		n.logger.Info("Could not get PodList", err)
		return gpuv1.NotReady
	}
	n.logger.V(2).Info("Pod", "NumberOfPods", len(list.Items))
	if len(list.Items) == 0 {
		return gpuv1.NotReady
	}

	// keep only pods actually owned by this DaemonSet, then compare each pod's
	// controller-revision hash against the DaemonSet's latest revision
	dsPods := getPodsOwnedbyDaemonset(ds, list.Items, n)
	daemonsetRevisionHash, err := getDaemonsetControllerRevisionHash(ctx, ds, n)
	if err != nil {
		n.logger.Error(
			err, "Failed to get daemonset template revision hash", "daemonset", ds)
		return gpuv1.NotReady
	}
	n.logger.V(2).Info("daemonset template revision hash", "hash", daemonsetRevisionHash)

	for _, pod := range dsPods {
		pod := pod
		podRevisionHash, err := getPodControllerRevisionHash(ctx, &pod)
		if err != nil {
			n.logger.Error(
				err, "Failed to get pod template revision hash", "pod", pod)
			return gpuv1.NotReady
		}
		n.logger.V(2).Info("pod template revision hash", "hash", podRevisionHash)

		// check if the revision hashes are matching and pod is in running state
		if podRevisionHash != daemonsetRevisionHash || pod.Status.Phase != "Running" {
			return gpuv1.NotReady
		}

		// If the pod generation matches the daemonset generation and the pod is running
		// and it has at least 1 container
		if len(pod.Status.ContainerStatuses) != 0 {
			for i := range pod.Status.ContainerStatuses {
				if !pod.Status.ContainerStatuses[i].Ready {
					// Return false if at least 1 container isn't ready
					return gpuv1.NotReady
				}
			}
		}
	}

	// All containers are ready
	return gpuv1.Ready
}
3761

3762
func getPodsOwnedbyDaemonset(ds *appsv1.DaemonSet, pods []corev1.Pod, n ClusterPolicyController) []corev1.Pod {
×
3763
        dsPodList := []corev1.Pod{}
×
3764
        for _, pod := range pods {
×
UNCOV
3765
                if len(pod.OwnerReferences) < 1 {
×
3766
                        n.logger.Info("Driver Pod has no owner DaemonSet", "pod", pod.Name)
×
3767
                        continue
×
3768
                }
3769
                n.logger.V(2).Info("Pod", "pod", pod.Name, "owner", pod.OwnerReferences[0].Name)
×
3770

×
3771
                if ds.UID != pod.OwnerReferences[0].UID {
×
3772
                        n.logger.Info("Driver Pod is not owned by a Driver DaemonSet",
×
3773
                                "pod", pod, "actual owner", pod.OwnerReferences[0])
×
3774
                        continue
×
3775
                }
3776
                dsPodList = append(dsPodList, pod)
×
3777
        }
3778
        return dsPodList
×
3779
}
3780

3781
func getPodControllerRevisionHash(ctx context.Context, pod *corev1.Pod) (string, error) {
×
3782
        if hash, ok := pod.Labels[PodControllerRevisionHashLabelKey]; ok {
×
3783
                return hash, nil
×
3784
        }
×
3785
        return "", fmt.Errorf("controller-revision-hash label not present for pod %s", pod.Name)
×
3786
}
3787

3788
func getDaemonsetControllerRevisionHash(ctx context.Context, daemonset *appsv1.DaemonSet, n ClusterPolicyController) (string, error) {
×
3789

×
3790
        // get all revisions for the daemonset
×
3791
        opts := []client.ListOption{
×
3792
                client.MatchingLabels(daemonset.Spec.Selector.MatchLabels),
×
3793
                client.InNamespace(n.operatorNamespace),
×
3794
        }
×
3795
        list := &appsv1.ControllerRevisionList{}
×
3796
        err := n.client.List(ctx, list, opts...)
×
3797
        if err != nil {
×
3798
                return "", fmt.Errorf("error getting controller revision list for daemonset %s: %v", daemonset.Name, err)
×
3799
        }
×
3800

3801
        n.logger.V(2).Info("obtained controller revisions", "Daemonset", daemonset.Name, "len", len(list.Items))
×
3802

×
UNCOV
3803
        var revisions []appsv1.ControllerRevision
×
UNCOV
3804
        for _, controllerRevision := range list.Items {
×
UNCOV
3805
                if strings.HasPrefix(controllerRevision.Name, daemonset.Name) {
×
3806
                        revisions = append(revisions, controllerRevision)
×
3807
                }
×
3808
        }
3809

3810
        if len(revisions) == 0 {
×
3811
                return "", fmt.Errorf("no revision found for daemonset %s", daemonset.Name)
×
UNCOV
3812
        }
×
3813

3814
        // sort the revision list to make sure we obtain latest revision always
UNCOV
3815
        sort.Slice(revisions, func(i, j int) bool { return revisions[i].Revision < revisions[j].Revision })
×
3816

3817
        currentRevision := revisions[len(revisions)-1]
×
UNCOV
3818
        hash := strings.TrimPrefix(currentRevision.Name, fmt.Sprintf("%s-", daemonset.Name))
×
UNCOV
3819

×
3820
        return hash, nil
×
3821
}
3822

3823
// TransformDRADriverController transforms nvidia-dra-driver-controller deployment with required config as per ClusterPolicy
3824
func TransformDRADriverController(obj *appsv1.Deployment, spec *gpuv1.ClusterPolicySpec) error {
1✔
3825
        config := spec.DRADriver
1✔
3826
        image, err := gpuv1.ImagePath(&config)
1✔
3827
        if err != nil {
2✔
3828
                return err
1✔
3829
        }
1✔
3830
        obj.Spec.Template.Spec.Containers[0].Image = image
1✔
3831
        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "IMAGE_NAME", image)
1✔
3832

1✔
3833
        obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.ImagePullPolicy)
1✔
3834

1✔
3835
        if len(config.ImagePullSecrets) > 0 {
1✔
NEW
3836
                addPullSecrets(&obj.Spec.Template.Spec, config.ImagePullSecrets)
×
NEW
UNCOV
3837
        }
×
3838

3839
        if len(config.ComputeDomains.Controller.Tolerations) > 0 {
2✔
3840
                obj.Spec.Template.Spec.Tolerations = append(obj.Spec.Template.Spec.Tolerations, config.ComputeDomains.Controller.Tolerations...)
1✔
3841
        }
1✔
3842

3843
        if len(config.ComputeDomains.Controller.Env) > 0 {
2✔
3844
                for _, env := range config.ComputeDomains.Controller.Env {
2✔
3845
                        setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
1✔
3846
                }
1✔
3847
        }
3848

3849
        if config.ComputeDomains.Controller.Resources != nil {
2✔
3850
                obj.Spec.Template.Spec.Containers[0].Resources.Requests = config.ComputeDomains.Controller.Resources.Requests
1✔
3851
                obj.Spec.Template.Spec.Containers[0].Resources.Limits = config.ComputeDomains.Controller.Resources.Limits
1✔
3852
        }
1✔
3853

3854
        return nil
1✔
3855
}
3856

NEW
3857
func transformDeployment(obj *appsv1.Deployment, n ClusterPolicyController) error {
×
NEW
UNCOV
3858
        logger := n.logger.WithValues("Deployment", obj.Name, "Namespace", obj.Namespace)
×
NEW
3859
        switch obj.Name {
×
NEW
3860
        case "nvidia-dra-driver-controller":
×
NEW
3861
                return TransformDRADriverController(obj, &n.singleton.Spec)
×
NEW
3862
        default:
×
NEW
3863
                logger.Info("No transformation for object")
×
NEW
3864
                return nil
×
3865
        }
3866
}
3867

3868
// Deployment creates (or updates) the Deployment resource for the current
// reconciliation state and returns its readiness. If the state is disabled —
// or the object is the DRA driver controller while ComputeDomains is disabled —
// any existing Deployment is deleted and Disabled is returned.
func Deployment(n ClusterPolicyController) (gpuv1.State, error) {
	ctx := n.ctx
	state := n.idx
	stateName := n.stateNames[state]
	// work on a deep copy of the manifest so transforms never mutate the cache
	obj := n.resources[state].Deployment.DeepCopy()
	obj.Namespace = n.operatorNamespace

	logger := n.logger.WithValues("Deployment", obj.Name, "Namespace", obj.Namespace)

	// Check if state is disabled and cleanup resource if exists
	if !n.isStateEnabled(stateName) || (obj.Name == "nvidia-dra-driver-controller" && !n.singleton.Spec.DRADriver.IsComputeDomainsEnabled()) {
		err := n.client.Delete(ctx, obj)
		// a missing object is fine — it simply means nothing was deployed
		if err != nil && !apierrors.IsNotFound(err) {
			logger.Info("Couldn't delete", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Disabled, nil
	}

	// apply object-specific ClusterPolicy transformations
	if err := transformDeployment(obj, n); err != nil {
		logger.Info("Failed to transform Deployment", "Error", err)
		return gpuv1.NotReady, err
	}

	// make ClusterPolicy the owner so the object is garbage-collected with it
	if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
		return gpuv1.NotReady, err
	}

	// create-then-update: fall back to Update when the object already exists
	if err := n.client.Create(ctx, obj); err != nil {
		if apierrors.IsAlreadyExists(err) {
			logger.Info("Found Resource, updating...")
			err = n.client.Update(ctx, obj)
			if err != nil {
				logger.Info("Couldn't update", "Error", err)
				return gpuv1.NotReady, err
			}
			return isDeploymentReady(obj.Name, n), nil
		}

		logger.Info("Couldn't create", "Error", err)
		return gpuv1.NotReady, err
	}

	return isDeploymentReady(obj.Name, n), nil
}
3914

3915
func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) {
×
3916
        ctx := n.ctx
×
3917
        found := &apiimagev1.ImageStream{}
×
3918
        name := "driver-toolkit"
×
3919
        namespace := consts.OpenshiftNamespace
×
3920
        err := n.client.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, found)
×
3921
        if err != nil {
×
3922
                if apierrors.IsNotFound(err) {
×
3923
                        n.logger.Info("ocpHasDriverToolkitImageStream: driver-toolkit imagestream not found",
×
3924
                                "Name", name,
×
3925
                                "Namespace", namespace)
×
3926

×
3927
                        return false, nil
×
3928
                }
×
3929

3930
                n.logger.Info("Couldn't get the driver-toolkit imagestream", "Error", err)
×
3931

×
UNCOV
3932
                return false, err
×
3933
        }
3934
        n.logger.V(1).Info("ocpHasDriverToolkitImageStream: driver-toolkit imagestream found")
×
3935
        isBroken := false
×
3936
        for _, tag := range found.Spec.Tags {
×
3937
                if tag.Name == "" {
×
UNCOV
3938
                        isBroken = true
×
UNCOV
3939
                        continue
×
3940
                }
3941
                if tag.Name == "latest" || tag.From == nil {
×
3942
                        continue
×
3943
                }
3944
                n.logger.V(1).Info("ocpHasDriverToolkitImageStream: tag", tag.Name, tag.From.Name)
×
3945
                n.ocpDriverToolkit.rhcosDriverToolkitImages[tag.Name] = tag.From.Name
×
3946
        }
3947
        if isBroken {
×
3948
                n.logger.Info("WARNING: ocpHasDriverToolkitImageStream: driver-toolkit imagestream is broken, see RHBZ#2015024")
×
3949

×
3950
                n.operatorMetrics.openshiftDriverToolkitIsBroken.Set(1)
×
3951
        } else {
×
3952
                n.operatorMetrics.openshiftDriverToolkitIsBroken.Set(0)
×
3953
        }
×
3954

3955
        return true, nil
×
3956
}
3957

3958
func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) error {
×
3959
        // Get all DaemonSets owned by ClusterPolicy
×
3960
        //
×
3961
        // (cdesiniotis) There is a limitation with the controller-runtime client where only a single field selector
×
UNCOV
3962
        // is allowed when specifying ListOptions or DeleteOptions.
×
UNCOV
3963
        // See GH issue: https://github.com/kubernetes-sigs/controller-runtime/issues/612
×
3964
        list := &appsv1.DaemonSetList{}
×
3965
        err := n.client.List(ctx, list, client.MatchingFields{clusterPolicyControllerIndexKey: n.singleton.Name})
×
3966
        if err != nil {
×
UNCOV
3967
                return fmt.Errorf("failed to list all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err)
×
3968
        }
×
3969

3970
        for _, ds := range list.Items {
×
3971
                ds := ds
×
3972
                // filter out DaemonSets which are not the NVIDIA driver/vgpu-manager
×
3973
                if strings.HasPrefix(ds.Name, commonDriverDaemonsetName) || strings.HasPrefix(ds.Name, commonVGPUManagerDaemonsetName) {
×
3974
                        n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name)
×
3975
                        err = n.client.Delete(ctx, &ds)
×
3976
                        if err != nil {
×
UNCOV
3977
                                return fmt.Errorf("error deleting NVIDIA driver daemonset: %w", err)
×
UNCOV
3978
                        }
×
3979
                }
3980
        }
3981

UNCOV
3982
        return nil
×
3983
}
3984

3985
// cleanupStalePrecompiledDaemonsets deletes stale driver daemonsets which can happen
3986
// 1. If all nodes upgraded to the latest kernel
3987
// 2. no GPU nodes are present
3988
func (n ClusterPolicyController) cleanupStalePrecompiledDaemonsets(ctx context.Context) error {
1✔
3989
        opts := []client.ListOption{
1✔
3990
                client.MatchingLabels{
1✔
3991
                        precompiledIdentificationLabelKey: precompiledIdentificationLabelValue,
1✔
3992
                },
1✔
3993
        }
1✔
3994
        list := &appsv1.DaemonSetList{}
1✔
3995
        err := n.client.List(ctx, list, opts...)
1✔
3996
        if err != nil {
1✔
3997
                n.logger.Error(err, "could not get daemonset list")
×
3998
                return err
×
3999
        }
×
4000

4001
        for idx := range list.Items {
1✔
4002
                ds := list.Items[idx]
×
4003
                name := ds.Name
×
UNCOV
4004
                desiredNumberScheduled := ds.Status.DesiredNumberScheduled
×
4005
                numberMisscheduled := ds.Status.NumberMisscheduled
×
4006

×
4007
                n.logger.V(1).Info("Driver DaemonSet found",
×
4008
                        "Name", name,
×
4009
                        "Status.DesiredNumberScheduled", desiredNumberScheduled)
×
4010

×
UNCOV
4011
                // We consider a daemonset to be stale only if it has no desired number of pods and no pods currently mis-scheduled
×
4012
                // As per the Kubernetes docs, a daemonset pod is mis-scheduled when an already scheduled pod no longer satisfies
×
4013
                // node affinity constraints or has un-tolerated taints, for e.g. "node.kubernetes.io/unreachable:NoSchedule"
×
UNCOV
4014
                if desiredNumberScheduled == 0 && numberMisscheduled == 0 {
×
4015
                        n.logger.Info("Delete Driver DaemonSet", "Name", name)
×
4016

×
UNCOV
4017
                        err = n.client.Delete(ctx, &ds)
×
4018
                        if err != nil {
×
4019
                                n.logger.Error(err, "Could not get delete DaemonSet",
×
4020
                                        "Name", name)
×
4021
                        }
×
4022
                } else {
×
4023
                        n.logger.Info("Driver DaemonSet active, keep it.",
×
4024
                                "Name", name,
×
UNCOV
4025
                                "Status.DesiredNumberScheduled", desiredNumberScheduled)
×
4026
                }
×
4027
        }
4028
        return nil
1✔
4029
}
4030

4031
// precompiledDriverDaemonsets goes through all the kernel versions
4032
// found in the cluster, sets `currentKernelVersion` and calls the
4033
// original DaemonSet() function to create/update the kernel-specific
4034
// DaemonSet.
4035
func precompiledDriverDaemonsets(ctx context.Context, n ClusterPolicyController) (gpuv1.State, []error) {
1✔
4036
        overallState := gpuv1.Ready
1✔
4037
        var errs []error
1✔
4038
        n.logger.Info("cleaning any stale precompiled driver daemonsets")
1✔
4039
        err := n.cleanupStalePrecompiledDaemonsets(ctx)
1✔
4040
        if err != nil {
1✔
4041
                return gpuv1.NotReady, append(errs, err)
×
4042
        }
×
4043

4044
        n.logger.V(1).Info("preparing pre-compiled driver daemonsets")
1✔
4045
        for kernelVersion, os := range n.kernelVersionMap {
2✔
4046
                // set current kernel version
1✔
4047
                n.currentKernelVersion = kernelVersion
1✔
4048

1✔
4049
                n.logger.Info("preparing pre-compiled driver daemonset",
1✔
4050
                        "version", n.currentKernelVersion, "os", os)
1✔
4051

1✔
4052
                state, err := DaemonSet(n)
1✔
4053
                if state != gpuv1.Ready {
1✔
UNCOV
4054
                        n.logger.Info("pre-compiled driver daemonset not ready",
×
UNCOV
4055
                                "version", n.currentKernelVersion, "state", state)
×
UNCOV
4056
                        overallState = state
×
UNCOV
4057
                }
×
4058
                if err != nil {
1✔
UNCOV
4059
                        errs = append(errs, fmt.Errorf("failed to handle Precompiled Driver Daemonset for version %s: %v", kernelVersion, err))
×
UNCOV
4060
                }
×
4061
        }
4062

4063
        // reset current kernel version
4064
        n.currentKernelVersion = ""
1✔
4065
        return overallState, errs
1✔
4066
}
4067

4068
// ocpDriverToolkitDaemonSets goes through all the RHCOS versions
4069
// found in the cluster, sets `currentRhcosVersion` and calls the
4070
// original DaemonSet() function to create/update the RHCOS-specific
4071
// DaemonSet.
UNCOV
4072
func (n ClusterPolicyController) ocpDriverToolkitDaemonSets(ctx context.Context) (gpuv1.State, error) {
×
4073
        err := n.ocpCleanupStaleDriverToolkitDaemonSets(ctx)
×
4074
        if err != nil {
×
4075
                return gpuv1.NotReady, err
×
4076
        }
×
4077

4078
        n.logger.V(1).Info("preparing DriverToolkit DaemonSet",
×
4079
                "rhcos", n.ocpDriverToolkit.rhcosVersions)
×
4080

×
4081
        overallState := gpuv1.Ready
×
4082
        var errs error
×
4083

×
4084
        for rhcosVersion := range n.ocpDriverToolkit.rhcosVersions {
×
4085
                n.ocpDriverToolkit.currentRhcosVersion = rhcosVersion
×
4086

×
4087
                n.logger.V(1).Info("preparing DriverToolkit DaemonSet",
×
4088
                        "rhcosVersion", n.ocpDriverToolkit.currentRhcosVersion)
×
4089

×
4090
                state, err := DaemonSet(n)
×
4091

×
4092
                n.logger.V(1).Info("preparing DriverToolkit DaemonSet",
×
4093
                        "rhcosVersion", n.ocpDriverToolkit.currentRhcosVersion, "state", state)
×
4094
                if state != gpuv1.Ready {
×
4095
                        overallState = state
×
4096
                }
×
4097

UNCOV
4098
                if err != nil {
×
UNCOV
4099
                        if errs == nil {
×
UNCOV
4100
                                errs = err
×
UNCOV
4101
                        }
×
UNCOV
4102
                        errs = fmt.Errorf("failed to handle OpenShift Driver Toolkit Daemonset for version %s: %v", rhcosVersion, errs)
×
4103
                }
4104
        }
4105

UNCOV
4106
        n.ocpDriverToolkit.currentRhcosVersion = ""
×
UNCOV
4107

×
UNCOV
4108
        tagsMissing := false
×
UNCOV
4109
        for rhcosVersion, image := range n.ocpDriverToolkit.rhcosDriverToolkitImages {
×
UNCOV
4110
                if image != "" {
×
UNCOV
4111
                        continue
×
4112
                }
4113
                n.logger.Info("WARNINGs: RHCOS driver-toolkit image missing. Version-specific fallback mode enabled.", "rhcosVersion", rhcosVersion)
×
UNCOV
4114
                tagsMissing = true
×
4115
        }
UNCOV
4116
        if tagsMissing {
×
UNCOV
4117
                n.operatorMetrics.openshiftDriverToolkitRhcosTagsMissing.Set(1)
×
UNCOV
4118
        } else {
×
UNCOV
4119
                n.operatorMetrics.openshiftDriverToolkitRhcosTagsMissing.Set(0)
×
UNCOV
4120
        }
×
4121

UNCOV
4122
        return overallState, errs
×
4123
}
4124

4125
// ocpCleanupStaleDriverToolkitDaemonSets scans the DriverToolkit
4126
// RHCOS-version specific DaemonSets, and deletes the unused one:
4127
// - RHCOS version wasn't found in the node labels (upgrade finished)
4128
// - RHCOS version marked for deletion earlier in the Reconciliation loop (currently unexpected)
4129
// - no RHCOS version label (unexpected)
4130
// The DaemonSet set is kept if:
4131
// - RHCOS version was found in the node labels (most likely case)
UNCOV
4132
func (n ClusterPolicyController) ocpCleanupStaleDriverToolkitDaemonSets(ctx context.Context) error {
×
UNCOV
4133
        opts := []client.ListOption{
×
UNCOV
4134
                client.MatchingLabels{
×
UNCOV
4135
                        ocpDriverToolkitIdentificationLabel: ocpDriverToolkitIdentificationValue,
×
UNCOV
4136
                },
×
UNCOV
4137
        }
×
UNCOV
4138

×
UNCOV
4139
        list := &appsv1.DaemonSetList{}
×
UNCOV
4140
        err := n.client.List(ctx, list, opts...)
×
UNCOV
4141
        if err != nil {
×
UNCOV
4142
                n.logger.Info("ERROR: Could not get DaemonSetList", "Error", err)
×
4143
                return err
×
4144
        }
×
4145

4146
        for idx := range list.Items {
×
4147
                name := list.Items[idx].Name
×
UNCOV
4148
                dsRhcosVersion, versionOk := list.Items[idx].Labels[ocpDriverToolkitVersionLabel]
×
4149
                clusterHasRhcosVersion, clusterOk := n.ocpDriverToolkit.rhcosVersions[dsRhcosVersion]
×
4150
                desiredNumberScheduled := list.Items[idx].Status.DesiredNumberScheduled
×
4151

×
4152
                n.logger.V(1).Info("Driver DaemonSet found",
×
4153
                        "Name", name,
×
4154
                        "dsRhcosVersion", dsRhcosVersion,
×
4155
                        "clusterHasRhcosVersion", clusterHasRhcosVersion,
×
4156
                        "desiredNumberScheduled", desiredNumberScheduled)
×
4157

×
4158
                if desiredNumberScheduled != 0 {
×
4159
                        n.logger.Info("Driver DaemonSet active, keep it.",
×
4160
                                "Name", name, "Status.DesiredNumberScheduled", desiredNumberScheduled)
×
4161
                        continue
×
4162
                }
4163

4164
                if !versionOk {
×
4165
                        n.logger.Info("WARNING: Driver DaemonSet doesn't have DriverToolkit version label",
×
4166
                                "Name", name, "Label", ocpDriverToolkitVersionLabel,
×
4167
                        )
×
UNCOV
4168
                } else {
×
4169
                        switch {
×
4170
                        case !clusterOk:
×
4171
                                n.logger.V(1).Info("Driver DaemonSet RHCOS version NOT part of the cluster",
×
4172
                                        "Name", name, "RHCOS version", dsRhcosVersion,
×
4173
                                )
×
UNCOV
4174
                        case clusterHasRhcosVersion:
×
UNCOV
4175
                                n.logger.V(1).Info("Driver DaemonSet RHCOS version is part of the cluster, keep it.",
×
UNCOV
4176
                                        "Name", name, "RHCOS version", dsRhcosVersion,
×
4177
                                )
×
4178

×
4179
                                // the version of RHCOS targeted by this DS is part of the cluster
×
4180
                                // keep it alive
×
4181

×
4182
                                continue
×
UNCOV
4183
                        default: /* clusterHasRhcosVersion == false */
×
4184
                                // currently unexpected
×
4185
                                n.logger.V(1).Info("Driver DaemonSet RHCOS version marked for deletion",
×
UNCOV
4186
                                        "Name", name, "RHCOS version", dsRhcosVersion,
×
4187
                                )
×
4188
                        }
4189
                }
4190

4191
                n.logger.Info("Delete Driver DaemonSet", "Name", name)
×
UNCOV
4192
                err = n.client.Delete(ctx, &list.Items[idx])
×
4193
                if err != nil {
×
UNCOV
4194
                        n.logger.Info("ERROR: Could not get delete DaemonSet",
×
UNCOV
4195
                                "Name", name, "Error", err)
×
UNCOV
4196
                        return err
×
UNCOV
4197
                }
×
4198
        }
UNCOV
4199
        return nil
×
4200
}
4201

4202
// cleanupUnusedVGPUManagerDaemonsets cleans up the vgpu-manager DaemonSet(s)
4203
// according to the operator.useOCPDriverToolkit is enabled for ocp
4204
// This allows switching toggling the flag after the initial deployment.  If no
4205
// error happens, returns the number of Pods belonging to these
4206
// DaemonSets.
4207
func (n ClusterPolicyController) cleanupUnusedVGPUManagerDaemonsets(ctx context.Context) (int, error) {
1✔
4208
        podCount := 0
1✔
4209
        if n.openshift == "" {
2✔
4210
                return podCount, nil
1✔
4211
        }
1✔
4212

4213
        if !n.ocpDriverToolkit.enabled {
×
4214
                // cleanup DTK daemonsets
×
4215
                count, err := n.cleanupDriverDaemonsets(ctx,
×
UNCOV
4216
                        ocpDriverToolkitIdentificationLabel,
×
4217
                        ocpDriverToolkitIdentificationValue, commonVGPUManagerDaemonsetName)
×
4218
                if err != nil {
×
4219
                        return 0, err
×
4220
                }
×
4221
                podCount = count
×
4222
        } else {
×
4223
                // cleanup legacy vgpu-manager daemonsets
×
4224
                count, err := n.cleanupDriverDaemonsets(ctx,
×
4225
                        appLabelKey,
×
4226
                        commonVGPUManagerDaemonsetName, commonVGPUManagerDaemonsetName)
×
4227
                if err != nil {
×
4228
                        return 0, err
×
4229
                }
×
4230
                podCount = count
×
4231
        }
4232
        return podCount, nil
×
4233
}
4234

4235
// cleanupUnusedDriverDaemonSets cleans up the driver DaemonSet(s)
4236
// according to following.
4237
// 1. If driver.usePrecompiled is enabled
4238
// 2. if operator.useOCPDriverToolkit is enabled for ocp
4239
// This allows switching toggling the flag after the initial deployment.  If no
4240
// error happens, returns the number of Pods belonging to these
4241
// DaemonSets.
4242
func (n ClusterPolicyController) cleanupUnusedDriverDaemonSets(ctx context.Context) (int, error) {
1✔
4243
        podCount := 0
1✔
4244
        if n.openshift != "" {
1✔
4245
                switch {
×
4246
                case n.singleton.Spec.Driver.UsePrecompiledDrivers():
×
4247
                        // cleanup DTK daemonsets
×
4248
                        count, err := n.cleanupDriverDaemonsets(ctx,
×
4249
                                ocpDriverToolkitIdentificationLabel,
×
4250
                                ocpDriverToolkitIdentificationValue, commonDriverDaemonsetName)
×
4251
                        if err != nil {
×
4252
                                return 0, err
×
4253
                        }
×
4254
                        podCount = count
×
4255
                        // cleanup legacy driver daemonsets that use run file
×
4256
                        count, err = n.cleanupDriverDaemonsets(ctx,
×
4257
                                precompiledIdentificationLabelKey,
×
4258
                                "false", commonDriverDaemonsetName)
×
UNCOV
4259
                        if err != nil {
×
UNCOV
4260
                                return 0, err
×
UNCOV
4261
                        }
×
4262
                        podCount += count
×
4263

4264
                case n.ocpDriverToolkit.enabled:
×
4265
                        // cleanup pre-compiled and legacy driver daemonsets
×
4266
                        count, err := n.cleanupDriverDaemonsets(ctx,
×
4267
                                appLabelKey,
×
4268
                                commonDriverDaemonsetName, commonDriverDaemonsetName)
×
UNCOV
4269
                        if err != nil {
×
4270
                                return 0, err
×
UNCOV
4271
                        }
×
UNCOV
4272
                        podCount = count
×
UNCOV
4273
                default:
×
UNCOV
4274
                        // cleanup pre-compiled
×
UNCOV
4275
                        count, err := n.cleanupDriverDaemonsets(ctx,
×
UNCOV
4276
                                precompiledIdentificationLabelKey,
×
UNCOV
4277
                                precompiledIdentificationLabelValue, commonDriverDaemonsetName)
×
UNCOV
4278
                        if err != nil {
×
UNCOV
4279
                                return 0, err
×
UNCOV
4280
                        }
×
UNCOV
4281
                        podCount = count
×
UNCOV
4282

×
UNCOV
4283
                        // cleanup DTK daemonsets
×
4284
                        count, err = n.cleanupDriverDaemonsets(ctx,
×
4285
                                ocpDriverToolkitIdentificationLabel,
×
4286
                                ocpDriverToolkitIdentificationValue, commonDriverDaemonsetName)
×
4287
                        if err != nil {
×
4288
                                return 0, err
×
4289
                        }
×
4290
                        podCount += count
×
4291
                }
4292
        } else {
1✔
4293
                if n.singleton.Spec.Driver.UsePrecompiledDrivers() {
2✔
4294
                        // cleanup legacy driver daemonsets that use run file
1✔
4295
                        count, err := n.cleanupDriverDaemonsets(ctx,
1✔
4296
                                precompiledIdentificationLabelKey,
1✔
4297
                                "false", commonDriverDaemonsetName)
1✔
4298
                        if err != nil {
1✔
4299
                                return 0, err
×
4300
                        }
×
4301
                        podCount = count
1✔
4302
                } else {
1✔
4303
                        // cleanup pre-compiled driver daemonsets
1✔
4304
                        count, err := n.cleanupDriverDaemonsets(ctx,
1✔
4305
                                precompiledIdentificationLabelKey,
1✔
4306
                                precompiledIdentificationLabelValue, commonDriverDaemonsetName)
1✔
4307
                        if err != nil {
1✔
UNCOV
4308
                                return 0, err
×
UNCOV
4309
                        }
×
4310
                        podCount = count
1✔
4311
                }
4312
        }
4313
        return podCount, nil
1✔
4314
}
4315

4316
// cleanupDriverDaemonSets deletes the DaemonSets matching a given key/value
4317
// pairs If no error happens, returns the number of Pods belonging to
4318
// the DaemonSet.
4319
func (n ClusterPolicyController) cleanupDriverDaemonsets(ctx context.Context, searchKey string, searchValue string, namePrefix string) (int, error) {
1✔
4320
        var opts = []client.ListOption{client.MatchingLabels{searchKey: searchValue}}
1✔
4321

1✔
4322
        dsList := &appsv1.DaemonSetList{}
1✔
4323
        if err := n.client.List(ctx, dsList, opts...); err != nil {
1✔
4324
                n.logger.Error(err, "Could not get DaemonSetList")
×
4325
                return 0, err
×
4326
        }
×
4327

4328
        var lastErr error
1✔
4329
        for idx := range dsList.Items {
1✔
4330
                n.logger.Info("Delete DaemonSet",
×
4331
                        "Name", dsList.Items[idx].Name,
×
4332
                )
×
4333
                // ignore daemonsets that doesn't match the required name
×
UNCOV
4334
                if !strings.HasPrefix(dsList.Items[idx].Name, namePrefix) {
×
4335
                        continue
×
4336
                }
4337
                if err := n.client.Delete(ctx, &dsList.Items[idx]); err != nil {
×
4338
                        n.logger.Error(err, "Could not get delete DaemonSet",
×
4339
                                "Name", dsList.Items[idx].Name)
×
4340
                        lastErr = err
×
4341
                }
×
4342
        }
4343

4344
        // return the last error that occurred, if any
4345
        if lastErr != nil {
1✔
4346
                return 0, lastErr
×
4347
        }
×
4348

4349
        podList := &corev1.PodList{}
1✔
4350
        if err := n.client.List(ctx, podList, opts...); err != nil {
1✔
4351
                n.logger.Info("ERROR: Could not get PodList", "Error", err)
×
4352
                return 0, err
×
4353
        }
×
4354

4355
        podCount := 0
1✔
4356
        for idx := range podList.Items {
1✔
4357
                // ignore pods that doesn't match the required name
×
4358
                if !strings.HasPrefix(podList.Items[idx].Name, namePrefix) {
×
4359
                        continue
×
4360
                }
4361
                podCount++
×
4362
        }
4363
        return podCount, nil
1✔
4364
}
4365

4366
// DaemonSet creates (or updates) the DaemonSet resource for the current
// reconciliation state (n.idx). For the driver and vgpu-manager DaemonSets it
// additionally cleans up DaemonSet flavors that no longer match the configured
// deployment mode, and fans out into per-kernel-version (precompiled) or
// per-RHCOS-version (OpenShift DriverToolkit) deployments when needed.
// Returns the readiness state of the DaemonSet and any error encountered.
func DaemonSet(n ClusterPolicyController) (gpuv1.State, error) {
	ctx := n.ctx
	state := n.idx
	// Work on a copy of the template so the cached resource stays pristine.
	obj := n.resources[state].DaemonSet.DeepCopy()
	obj.Namespace = n.operatorNamespace

	logger := n.logger.WithValues("DaemonSet", obj.Name, "Namespace", obj.Namespace)

	// Check if state is disabled and cleanup resource if exists
	if !n.isStateEnabled(n.stateNames[n.idx]) {
		err := n.client.Delete(ctx, obj)
		if err != nil && !apierrors.IsNotFound(err) {
			logger.Info("Couldn't delete", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Disabled, nil
	}

	if !n.hasGPUNodes {
		// multiple DaemonSets (eg, driver, dgcm-exporter) cannot be
		// deployed without knowing the OS name, so skip their
		// deployment for now. The operator will be notified
		// (addWatchNewGPUNode) when new nodes will join the cluster.
		logger.Info("No GPU node in the cluster, do not create DaemonSets")
		return gpuv1.Ready, nil
	}

	if n.resources[state].DaemonSet.GetName() == commonDriverDaemonsetName {
		// Remove driver DaemonSet flavors that no longer match the
		// configured mode (precompiled vs. run-file vs. DriverToolkit).
		podCount, err := n.cleanupUnusedDriverDaemonSets(n.ctx)
		if err != nil {
			return gpuv1.NotReady, err
		}
		// Pods of the deleted DaemonSets are still terminating; retry later.
		if podCount != 0 {
			logger.Info("Driver DaemonSet cleanup in progress", "podCount", podCount)
			return gpuv1.NotReady, nil
		}

		// Daemonsets using pre-compiled packages or using driver-toolkit (openshift) require creation of
		// one daemonset per kernel version (or rhcos version).
		// If currentKernelVersion or currentRhcosVersion (ocp) are not set, we intercept here
		// and call Daemonset() per specific version
		if n.singleton.Spec.Driver.UsePrecompiledDrivers() {
			if n.currentKernelVersion == "" {
				overallState, errs := precompiledDriverDaemonsets(ctx, n)
				if len(errs) != 0 {
					// log errors
					return overallState, fmt.Errorf("unable to deploy precompiled driver daemonsets %v", errs)
				}
				return overallState, nil
			}
		} else if n.openshift != "" && n.ocpDriverToolkit.enabled &&
			n.ocpDriverToolkit.currentRhcosVersion == "" {
			// Top-most call: fan out into one DaemonSet per RHCOS version.
			return n.ocpDriverToolkitDaemonSets(ctx)
		}
	} else if n.resources[state].DaemonSet.Name == commonVGPUManagerDaemonsetName {
		podCount, err := n.cleanupUnusedVGPUManagerDaemonsets(ctx)
		if err != nil {
			return gpuv1.NotReady, err
		}
		if podCount != 0 {
			logger.Info("Driver DaemonSet cleanup in progress", "podCount", podCount)
			return gpuv1.NotReady, nil
		}
		if n.openshift != "" && n.ocpDriverToolkit.enabled &&
			n.ocpDriverToolkit.currentRhcosVersion == "" {
			// OpenShift Driver Toolkit requires the creation of
			// one Driver DaemonSet per RHCOS version (stored in
			// n.ocpDriverToolkit.rhcosVersions).
			//
			// Here, we are at the top-most call of DaemonSet(),
			// as currentRhcosVersion is unset.
			//
			// Initiate the multi-DaemonSet OCP DriverToolkit
			// deployment.
			return n.ocpDriverToolkitDaemonSets(ctx)
		}
	}

	// Resolve images, env vars, and state-specific transforms on the object.
	err := preProcessDaemonSet(obj, n)
	if err != nil {
		logger.Info("Could not pre-process", "Error", err)
		return gpuv1.NotReady, err
	}

	// Owner reference ties the DaemonSet lifetime to the ClusterPolicy CR.
	if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
		logger.Info("SetControllerReference failed", "Error", err)
		return gpuv1.NotReady, err
	}

	if obj.Labels == nil {
		obj.Labels = make(map[string]string)
	}

	// Apply user-provided labels from the ClusterPolicy spec.
	for labelKey, labelValue := range n.singleton.Spec.Daemonsets.Labels {
		obj.Labels[labelKey] = labelValue
	}

	// Daemonsets will always have at least one annotation applied, so allocate if necessary
	if obj.Annotations == nil {
		obj.Annotations = make(map[string]string)
	}

	// Apply user-provided annotations from the ClusterPolicy spec.
	for annoKey, annoValue := range n.singleton.Spec.Daemonsets.Annotations {
		obj.Annotations[annoKey] = annoValue
	}

	found := &appsv1.DaemonSet{}
	err = n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
	if err != nil && apierrors.IsNotFound(err) {
		logger.Info("DaemonSet not found, creating",
			"Name", obj.Name,
		)
		// generate hash for the spec to create
		hashStr := utils.GetObjectHash(obj)
		// add annotation to the Daemonset with hash value during creation
		obj.Annotations[NvidiaAnnotationHashKey] = hashStr
		err = n.client.Create(ctx, obj)
		if err != nil {
			logger.Info("Couldn't create DaemonSet",
				"Name", obj.Name,
				"Error", err,
			)
			return gpuv1.NotReady, err
		}
		return isDaemonSetReady(obj.Name, n), nil
	} else if err != nil {
		logger.Info("Failed to get DaemonSet from client",
			"Name", obj.Name,
			"Error", err.Error())
		return gpuv1.NotReady, err
	}

	// Compare the stored spec hash with the desired one; only update on drift.
	changed := isDaemonsetSpecChanged(found, obj)
	if changed {
		logger.Info("DaemonSet is different, updating", "name", obj.Name)
		err = n.client.Update(ctx, obj)
		if err != nil {
			return gpuv1.NotReady, err
		}
	} else {
		logger.Info("DaemonSet identical, skipping update", "name", obj.Name)
	}
	return isDaemonSetReady(obj.Name, n), nil
}
4511

4512
// isDaemonsetSpecChanged returns true if the spec has changed between existing one
4513
// and new Daemonset spec compared by hash.
UNCOV
4514
func isDaemonsetSpecChanged(current *appsv1.DaemonSet, new *appsv1.DaemonSet) bool {
×
UNCOV
4515
        if current == nil && new != nil {
×
UNCOV
4516
                return true
×
UNCOV
4517
        }
×
4518
        if current.Annotations == nil || new.Annotations == nil {
×
4519
                panic("appsv1.DaemonSet.Annotations must be allocated prior to calling isDaemonsetSpecChanged()")
×
4520
        }
4521

UNCOV
4522
        hashStr := utils.GetObjectHash(new)
×
4523
        foundHashAnnotation := false
×
4524

×
4525
        for annotation, value := range current.Annotations {
×
UNCOV
4526
                if annotation == NvidiaAnnotationHashKey {
×
UNCOV
4527
                        if value != hashStr {
×
4528
                                // update annotation to be added to Daemonset as per new spec and indicate spec update is required
×
4529
                                new.Annotations[NvidiaAnnotationHashKey] = hashStr
×
UNCOV
4530
                                return true
×
UNCOV
4531
                        }
×
4532
                        foundHashAnnotation = true
×
4533
                        break
×
4534
                }
4535
        }
4536

4537
        if !foundHashAnnotation {
×
4538
                // update annotation to be added to Daemonset as per new spec and indicate spec update is required
×
UNCOV
4539
                new.Annotations[NvidiaAnnotationHashKey] = hashStr
×
UNCOV
4540
                return true
×
4541
        }
×
4542
        return false
×
4543
}
4544

4545
// The operator starts two pods in different stages to validate
4546
// the correct working of the DaemonSets (driver and dp). Therefore
4547
// the operator waits until the Pod completes and checks the error status
4548
// to advance to the next state.
UNCOV
4549
func isPodReady(name string, n ClusterPolicyController, phase corev1.PodPhase) gpuv1.State {
×
UNCOV
4550
        ctx := n.ctx
×
UNCOV
4551
        opts := []client.ListOption{&client.MatchingLabels{"app": name}}
×
UNCOV
4552

×
UNCOV
4553
        n.logger.V(1).Info("Pod", "LabelSelector", fmt.Sprintf("app=%s", name))
×
UNCOV
4554
        list := &corev1.PodList{}
×
UNCOV
4555
        err := n.client.List(ctx, list, opts...)
×
4556
        if err != nil {
×
4557
                n.logger.Info("Could not get PodList", err)
×
4558
        }
×
4559
        n.logger.V(1).Info("Pod", "NumberOfPods", len(list.Items))
×
4560
        if len(list.Items) == 0 {
×
4561
                return gpuv1.NotReady
×
UNCOV
4562
        }
×
4563

4564
        pd := list.Items[0]
×
4565

×
4566
        if pd.Status.Phase != phase {
×
4567
                n.logger.V(1).Info("Pod", "Phase", pd.Status.Phase, "!=", phase)
×
4568
                return gpuv1.NotReady
×
UNCOV
4569
        }
×
4570
        n.logger.V(1).Info("Pod", "Phase", pd.Status.Phase, "==", phase)
×
4571
        return gpuv1.Ready
×
4572
}
4573

4574
// SecurityContextConstraints creates SCC resources for the current state
// (OpenShift only). It fills the placeholder user entries with the state's
// service account, then creates the SCC or updates it if it already exists.
// Returns Disabled when the state is turned off (deleting any stale SCC).
func SecurityContextConstraints(n ClusterPolicyController) (gpuv1.State, error) {
	ctx := n.ctx
	state := n.idx
	// Work on a copy so the cached template is not mutated.
	obj := n.resources[state].SecurityContextConstraints.DeepCopy()
	// Namespace is set for building the service-account user string below;
	// SCCs themselves are cluster-scoped (note the empty Namespace in Get).
	obj.Namespace = n.operatorNamespace

	logger := n.logger.WithValues("SecurityContextConstraints", obj.Name, "Namespace", "default")

	// Check if state is disabled and cleanup resource if exists
	if !n.isStateEnabled(n.stateNames[n.idx]) {
		err := n.client.Delete(ctx, obj)
		if err != nil && !apierrors.IsNotFound(err) {
			logger.Info("Couldn't delete", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Disabled, nil
	}

	// Replace manifest placeholders with the concrete service-account user.
	for idx := range obj.Users {
		if obj.Users[idx] != "FILLED BY THE OPERATOR" {
			continue
		}
		obj.Users[idx] = fmt.Sprintf("system:serviceaccount:%s:%s", obj.Namespace, obj.Name)
	}

	// Tie the SCC lifetime to the ClusterPolicy CR.
	if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
		return gpuv1.NotReady, err
	}

	// SCCs are cluster-scoped, hence the empty namespace in the lookup key.
	found := &secv1.SecurityContextConstraints{}
	err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found)
	if err != nil && apierrors.IsNotFound(err) {
		logger.Info("Not found, creating...")
		err = n.client.Create(ctx, obj)
		if err != nil {
			logger.Info("Couldn't create", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Ready, nil
	} else if err != nil {
		return gpuv1.NotReady, err
	}

	logger.Info("Found Resource, updating...")
	// Carry over the server-assigned resourceVersion so the update is accepted.
	obj.ResourceVersion = found.ResourceVersion

	err = n.client.Update(ctx, obj)
	if err != nil {
		logger.Info("Couldn't update", "Error", err)
		return gpuv1.NotReady, err
	}
	return gpuv1.Ready, nil
}
4628

4629
// Service creates Service object
4630
func Service(n ClusterPolicyController) (gpuv1.State, error) {
1✔
4631
        ctx := n.ctx
1✔
4632
        state := n.idx
1✔
4633
        obj := n.resources[state].Service.DeepCopy()
1✔
4634

1✔
4635
        obj.Namespace = n.operatorNamespace
1✔
4636

1✔
4637
        logger := n.logger.WithValues("Service", obj.Name, "Namespace", obj.Namespace)
1✔
4638

1✔
4639
        // Check if state is disabled and cleanup resource if exists
1✔
4640
        if !n.isStateEnabled(n.stateNames[n.idx]) {
1✔
4641
                err := n.client.Delete(ctx, obj)
×
4642
                if err != nil && !apierrors.IsNotFound(err) {
×
UNCOV
4643
                        logger.Info("Couldn't delete", "Error", err)
×
UNCOV
4644
                        return gpuv1.NotReady, err
×
UNCOV
4645
                }
×
4646
                return gpuv1.Disabled, nil
×
4647
        }
4648

4649
        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
1✔
4650
                return gpuv1.NotReady, err
×
4651
        }
×
4652

4653
        err := preprocessService(obj, n)
1✔
4654
        if err != nil {
1✔
4655
                logger.Info("Couldn't preprocess Service", "Error", err)
×
4656
                return gpuv1.NotReady, err
×
4657
        }
×
4658

4659
        found := &corev1.Service{}
1✔
4660
        err = n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
1✔
4661
        if err != nil && apierrors.IsNotFound(err) {
2✔
4662
                logger.Info("Not found, creating...")
1✔
4663
                err = n.client.Create(ctx, obj)
1✔
4664
                if err != nil {
1✔
4665
                        logger.Info("Couldn't create", "Error", err)
×
4666
                        return gpuv1.NotReady, err
×
UNCOV
4667
                }
×
4668
                return gpuv1.Ready, nil
1✔
UNCOV
4669
        } else if err != nil {
×
UNCOV
4670
                return gpuv1.NotReady, err
×
4671
        }
×
4672

4673
        logger.Info("Found Resource, updating...")
×
UNCOV
4674
        obj.ResourceVersion = found.ResourceVersion
×
4675
        obj.Spec.ClusterIP = found.Spec.ClusterIP
×
4676

×
4677
        err = n.client.Update(ctx, obj)
×
4678
        if err != nil {
×
4679
                logger.Info("Couldn't update", "Error", err)
×
4680
                return gpuv1.NotReady, err
×
4681
        }
×
4682
        return gpuv1.Ready, nil
×
4683
}
4684

4685
func crdExists(n ClusterPolicyController, name string) (bool, error) {
1✔
4686
        crd := &apiextensionsv1.CustomResourceDefinition{}
1✔
4687
        err := n.client.Get(n.ctx, client.ObjectKey{Name: name}, crd)
1✔
4688
        if err != nil && apierrors.IsNotFound(err) {
2✔
4689
                return false, nil
1✔
4690
        } else if err != nil {
1✔
4691
                return false, err
×
4692
        }
×
4693

4694
        return true, nil
×
4695
}
4696

4697
// ServiceMonitor creates ServiceMonitor object.
//
// The object is taken from the current state's manifest, placed in the
// operator namespace, and then created or updated in the cluster. If the
// owning state (or, for dcgm-exporter, the ServiceMonitor feature itself)
// is disabled, any previously created object is deleted instead and
// gpuv1.Disabled is returned. Returns gpuv1.NotReady together with the
// error on any API failure.
func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
	ctx := n.ctx
	state := n.idx
	obj := n.resources[state].ServiceMonitor.DeepCopy()
	obj.Namespace = n.operatorNamespace

	logger := n.logger.WithValues("ServiceMonitor", obj.Name, "Namespace", obj.Namespace)

	// Check if ServiceMonitor is a valid kind (i.e. the Prometheus
	// operator CRD is installed in this cluster).
	serviceMonitorCRDExists, err := crdExists(n, ServiceMonitorCRDName)
	if err != nil {
		return gpuv1.NotReady, err
	}

	// Check if state is disabled and cleanup resource if exists.
	if !n.isStateEnabled(n.stateNames[state]) {
		// Without the CRD no object could have been created; nothing to clean up.
		if !serviceMonitorCRDExists {
			return gpuv1.Ready, nil
		}
		err := n.client.Delete(ctx, obj)
		if err != nil && !apierrors.IsNotFound(err) {
			logger.Info("Couldn't delete", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Disabled, nil
	}

	if n.stateNames[state] == "state-dcgm-exporter" {
		serviceMonitor := n.singleton.Spec.DCGMExporter.ServiceMonitor
		// Check if ServiceMonitor is disabled and cleanup resource if exists.
		if serviceMonitor == nil || !serviceMonitor.IsEnabled() {
			if !serviceMonitorCRDExists {
				return gpuv1.Ready, nil
			}
			err := n.client.Delete(ctx, obj)
			if err != nil && !apierrors.IsNotFound(err) {
				logger.Info("Couldn't delete", "Error", err)
				return gpuv1.NotReady, err
			}
			return gpuv1.Disabled, nil
		}

		// ServiceMonitor requested for DCGM but the CRD is missing:
		// surface an error and retry later rather than fail permanently.
		if !serviceMonitorCRDExists {
			logger.Error(fmt.Errorf("couldn't find ServiceMonitor CRD"), "Install Prometheus and necessary CRDs for gathering GPU metrics!")
			return gpuv1.NotReady, nil
		}

		// Apply custom edits for DCGM Exporter (scrape interval, label
		// handling, extra labels, and relabeling rules from ClusterPolicy).
		if serviceMonitor.Interval != "" {
			obj.Spec.Endpoints[0].Interval = serviceMonitor.Interval
		}

		if serviceMonitor.HonorLabels != nil {
			obj.Spec.Endpoints[0].HonorLabels = *serviceMonitor.HonorLabels
		}

		if serviceMonitor.AdditionalLabels != nil {
			for key, value := range serviceMonitor.AdditionalLabels {
				obj.Labels[key] = value
			}
		}
		if serviceMonitor.Relabelings != nil {
			// Copy the configured relabelings; nil entries are left as
			// zero-valued RelabelConfigs.
			relabelConfigs := make([]promv1.RelabelConfig, len(serviceMonitor.Relabelings))
			for i, relabel := range serviceMonitor.Relabelings {
				if relabel != nil {
					relabelConfigs[i] = *relabel
				}
			}
			obj.Spec.Endpoints[0].RelabelConfigs = relabelConfigs
		}
	}
	if n.stateNames[state] == "state-operator-metrics" || n.stateNames[state] == "state-node-status-exporter" {
		// if ServiceMonitor CRD is missing, assume prometheus is not setup and ignore CR creation
		if !serviceMonitorCRDExists {
			logger.V(1).Info("ServiceMonitor CRD is missing, ignoring creation of CR for operator-metrics")
			return gpuv1.Ready, nil
		}
		obj.Spec.NamespaceSelector.MatchNames = []string{obj.Namespace}
	}

	// Substitute the manifest placeholder namespace with the actual
	// operator namespace.
	for idx := range obj.Spec.NamespaceSelector.MatchNames {
		if obj.Spec.NamespaceSelector.MatchNames[idx] != "FILLED BY THE OPERATOR" {
			continue
		}
		obj.Spec.NamespaceSelector.MatchNames[idx] = obj.Namespace
	}

	// Make ClusterPolicy the owner so the object is garbage-collected
	// with it.
	if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
		return gpuv1.NotReady, err
	}

	found := &promv1.ServiceMonitor{}
	err = n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
	if err != nil && apierrors.IsNotFound(err) {
		logger.Info("Not found, creating...")
		err = n.client.Create(ctx, obj)
		if err != nil {
			logger.Info("Couldn't create", "Error", err)
			return gpuv1.NotReady, err
		}
		return gpuv1.Ready, nil
	} else if err != nil {
		return gpuv1.NotReady, err
	}

	logger.Info("Found Resource, updating...")
	// Carry over the ResourceVersion so the update does not conflict.
	obj.ResourceVersion = found.ResourceVersion

	err = n.client.Update(ctx, obj)
	if err != nil {
		logger.Info("Couldn't update", "Error", err)
		return gpuv1.NotReady, err
	}
	return gpuv1.Ready, nil
}
4813

4814
func transformRuntimeClassLegacy(n ClusterPolicyController, spec nodev1.RuntimeClass) (gpuv1.State, error) {
×
UNCOV
4815
        ctx := n.ctx
×
UNCOV
4816
        obj := &nodev1beta1.RuntimeClass{}
×
4817

×
4818
        obj.Name = spec.Name
×
4819
        obj.Handler = spec.Handler
×
UNCOV
4820

×
4821
        // apply runtime class name as per ClusterPolicy
×
4822
        if obj.Name == "FILLED_BY_OPERATOR" {
×
4823
                runtimeClassName := getRuntimeClass(&n.singleton.Spec)
×
UNCOV
4824
                obj.Name = runtimeClassName
×
4825
                obj.Handler = runtimeClassName
×
4826
        }
×
4827

4828
        obj.Labels = spec.Labels
×
UNCOV
4829

×
4830
        logger := n.logger.WithValues("RuntimeClass", obj.Name)
×
4831

×
4832
        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
×
4833
                return gpuv1.NotReady, err
×
4834
        }
×
4835

UNCOV
4836
        found := &nodev1beta1.RuntimeClass{}
×
4837
        err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found)
×
UNCOV
4838
        if err != nil && apierrors.IsNotFound(err) {
×
UNCOV
4839
                logger.Info("Not found, creating...")
×
4840
                err = n.client.Create(ctx, obj)
×
4841
                if err != nil {
×
4842
                        logger.Info("Couldn't create", "Error", err)
×
4843
                        return gpuv1.NotReady, err
×
4844
                }
×
4845
                return gpuv1.Ready, nil
×
4846
        } else if err != nil {
×
UNCOV
4847
                return gpuv1.NotReady, err
×
UNCOV
4848
        }
×
4849

4850
        logger.Info("Found Resource, updating...")
×
4851
        obj.ResourceVersion = found.ResourceVersion
×
UNCOV
4852

×
4853
        err = n.client.Update(ctx, obj)
×
UNCOV
4854
        if err != nil {
×
UNCOV
4855
                logger.Info("Couldn't update", "Error", err)
×
4856
                return gpuv1.NotReady, err
×
4857
        }
×
4858
        return gpuv1.Ready, nil
×
4859
}
4860

4861
func transformRuntimeClass(n ClusterPolicyController, spec nodev1.RuntimeClass) (gpuv1.State, error) {
×
4862
        ctx := n.ctx
×
4863
        obj := &nodev1.RuntimeClass{}
×
4864

×
4865
        obj.Name = spec.Name
×
4866
        obj.Handler = spec.Handler
×
4867

×
4868
        // apply runtime class name as per ClusterPolicy
×
4869
        if obj.Name == "FILLED_BY_OPERATOR" {
×
4870
                runtimeClassName := getRuntimeClass(&n.singleton.Spec)
×
4871
                obj.Name = runtimeClassName
×
4872
                obj.Handler = runtimeClassName
×
UNCOV
4873
        }
×
4874

4875
        obj.Labels = spec.Labels
×
4876

×
4877
        logger := n.logger.WithValues("RuntimeClass", obj.Name)
×
4878

×
4879
        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
×
4880
                return gpuv1.NotReady, err
×
4881
        }
×
4882

UNCOV
4883
        found := &nodev1.RuntimeClass{}
×
UNCOV
4884
        err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found)
×
4885
        if err != nil && apierrors.IsNotFound(err) {
×
4886
                logger.Info("Not found, creating...")
×
4887
                err = n.client.Create(ctx, obj)
×
4888
                if err != nil {
×
4889
                        logger.Info("Couldn't create", "Error", err)
×
4890
                        return gpuv1.NotReady, err
×
4891
                }
×
4892
                return gpuv1.Ready, nil
×
4893
        } else if err != nil {
×
4894
                return gpuv1.NotReady, err
×
4895
        }
×
4896

4897
        logger.Info("Found Resource, updating...")
×
UNCOV
4898
        obj.ResourceVersion = found.ResourceVersion
×
4899

×
4900
        err = n.client.Update(ctx, obj)
×
4901
        if err != nil {
×
4902
                logger.Info("Couldn't update", "Error", err)
×
4903
                return gpuv1.NotReady, err
×
4904
        }
×
4905
        return gpuv1.Ready, nil
×
4906
}
4907

4908
func transformKataRuntimeClasses(n ClusterPolicyController) (gpuv1.State, error) {
×
4909
        ctx := n.ctx
×
4910
        state := n.idx
×
4911
        config := n.singleton.Spec
×
4912

×
4913
        // Get all existing Kata RuntimeClasses
×
4914
        opts := []client.ListOption{&client.MatchingLabels{"nvidia.com/kata-runtime-class": "true"}}
×
4915
        list := &nodev1.RuntimeClassList{}
×
4916
        err := n.client.List(ctx, list, opts...)
×
4917
        if err != nil {
×
4918
                n.logger.Info("Could not get Kata RuntimeClassList", err)
×
4919
                return gpuv1.NotReady, fmt.Errorf("error getting kata RuntimeClassList: %v", err)
×
UNCOV
4920
        }
×
4921
        n.logger.V(1).Info("Kata RuntimeClasses", "Number", len(list.Items))
×
4922

×
4923
        if !config.KataManager.IsEnabled() {
×
4924
                // Delete all Kata RuntimeClasses
×
4925
                n.logger.Info("Kata Manager disabled, deleting all Kata RuntimeClasses")
×
4926
                for _, rc := range list.Items {
×
4927
                        rc := rc
×
4928
                        n.logger.V(1).Info("Deleting Kata RuntimeClass", "Name", rc.Name)
×
4929
                        err := n.client.Delete(ctx, &rc)
×
UNCOV
4930
                        if err != nil {
×
UNCOV
4931
                                return gpuv1.NotReady, fmt.Errorf("error deleting kata RuntimeClass '%s': %v", rc.Name, err)
×
4932
                        }
×
4933
                }
4934
                return gpuv1.Ready, nil
×
4935
        }
4936

4937
        // Get names of desired kata RuntimeClasses
4938
        rcNames := make(map[string]struct{})
×
4939
        for _, rc := range config.KataManager.Config.RuntimeClasses {
×
4940
                rcNames[rc.Name] = struct{}{}
×
4941
        }
×
4942

4943
        // Delete any existing Kata RuntimeClasses that are no longer specified in KataManager configuration
4944
        for _, rc := range list.Items {
×
UNCOV
4945
                if _, ok := rcNames[rc.Name]; !ok {
×
4946
                        rc := rc
×
4947
                        n.logger.Info("Deleting Kata RuntimeClass", "Name", rc.Name)
×
4948
                        err := n.client.Delete(ctx, &rc)
×
4949
                        if err != nil {
×
4950
                                return gpuv1.NotReady, fmt.Errorf("error deleting kata RuntimeClass '%s': %v", rc.Name, err)
×
4951
                        }
×
4952
                }
4953
        }
4954

4955
        // Using kata RuntimClass template, create / update RuntimeClass objects specified in KataManager configuration
4956
        template := n.resources[state].RuntimeClasses[0]
×
4957
        for _, rc := range config.KataManager.Config.RuntimeClasses {
×
4958
                logger := n.logger.WithValues("RuntimeClass", rc.Name)
×
4959

×
4960
                if rc.Name == config.Operator.RuntimeClass {
×
4961
                        return gpuv1.NotReady, fmt.Errorf("error creating kata runtimeclass '%s' as it conflicts with the runtimeclass used for the gpu-operator operand pods itself", rc.Name)
×
4962
                }
×
4963

4964
                obj := nodev1.RuntimeClass{}
×
4965
                obj.Name = rc.Name
×
4966
                obj.Handler = rc.Name
×
UNCOV
4967
                obj.Labels = template.Labels
×
4968
                obj.Scheduling = &nodev1.Scheduling{}
×
4969
                nodeSelector := make(map[string]string)
×
4970
                for k, v := range template.Scheduling.NodeSelector {
×
4971
                        nodeSelector[k] = v
×
4972
                }
×
4973
                if rc.NodeSelector != nil {
×
4974
                        // append user provided selectors to default nodeSelector
×
4975
                        for k, v := range rc.NodeSelector {
×
4976
                                nodeSelector[k] = v
×
UNCOV
4977
                        }
×
4978
                }
4979
                obj.Scheduling.NodeSelector = nodeSelector
×
4980

×
4981
                if err := controllerutil.SetControllerReference(n.singleton, &obj, n.scheme); err != nil {
×
4982
                        return gpuv1.NotReady, err
×
4983
                }
×
4984

4985
                found := &nodev1.RuntimeClass{}
×
4986
                err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found)
×
4987
                if err != nil && apierrors.IsNotFound(err) {
×
4988
                        logger.Info("Not found, creating...")
×
4989
                        err = n.client.Create(ctx, &obj)
×
4990
                        if err != nil {
×
4991
                                logger.Info("Couldn't create", "Error", err)
×
4992
                                return gpuv1.NotReady, err
×
4993
                        }
×
4994
                        continue
×
4995
                } else if err != nil {
×
4996
                        return gpuv1.NotReady, err
×
4997
                }
×
4998

4999
                logger.Info("Found Resource, updating...")
×
5000
                obj.ResourceVersion = found.ResourceVersion
×
5001

×
5002
                err = n.client.Update(ctx, &obj)
×
5003
                if err != nil {
×
UNCOV
5004
                        logger.Info("Couldn't update", "Error", err)
×
5005
                        return gpuv1.NotReady, err
×
UNCOV
5006
                }
×
5007
        }
UNCOV
5008
        return gpuv1.Ready, nil
×
5009
}
5010

5011
func RuntimeClasses(n ClusterPolicyController) (gpuv1.State, error) {
×
5012
        status := gpuv1.Ready
×
UNCOV
5013
        state := n.idx
×
UNCOV
5014

×
5015
        if n.stateNames[state] == "state-kata-manager" {
×
5016
                return transformKataRuntimeClasses(n)
×
5017
        }
×
5018

5019
        createRuntimeClassFunc := transformRuntimeClass
×
5020
        if semver.Compare(n.k8sVersion, nodev1MinimumAPIVersion) <= 0 {
×
5021
                createRuntimeClassFunc = transformRuntimeClassLegacy
×
5022
        }
×
5023

UNCOV
5024
        for _, obj := range n.resources[state].RuntimeClasses {
×
UNCOV
5025
                obj := obj
×
UNCOV
5026
                // When CDI is disabled, do not create the additional 'nvidia-cdi' and
×
5027
                // 'nvidia-legacy' runtime classes. Delete these objects if they were
×
5028
                // previously created.
×
5029
                if !n.singleton.Spec.CDI.IsEnabled() && (obj.Name == "nvidia-cdi" || obj.Name == "nvidia-legacy") {
×
5030
                        err := n.client.Delete(n.ctx, &obj)
×
5031
                        if err != nil && !apierrors.IsNotFound(err) {
×
5032
                                n.logger.Info("Couldn't delete", "RuntimeClass", obj.Name, "Error", err)
×
5033
                                return gpuv1.NotReady, err
×
UNCOV
5034
                        }
×
5035
                        continue
×
5036
                }
5037
                stat, err := createRuntimeClassFunc(n, obj)
×
5038
                if err != nil {
×
5039
                        return stat, err
×
5040
                }
×
5041
                if stat != gpuv1.Ready {
×
5042
                        status = gpuv1.NotReady
×
5043
                }
×
5044
        }
5045
        return status, nil
×
5046
}
5047

5048
// PrometheusRule creates PrometheusRule object
UNCOV
5049
func PrometheusRule(n ClusterPolicyController) (gpuv1.State, error) {
×
5050
        ctx := n.ctx
×
5051
        state := n.idx
×
5052
        obj := n.resources[state].PrometheusRule.DeepCopy()
×
5053
        obj.Namespace = n.operatorNamespace
×
5054

×
UNCOV
5055
        logger := n.logger.WithValues("PrometheusRule", obj.Name)
×
5056

×
5057
        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
×
5058
                return gpuv1.NotReady, err
×
5059
        }
×
5060

5061
        found := &promv1.PrometheusRule{}
×
5062
        err := n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
×
5063
        if err != nil && apierrors.IsNotFound(err) {
×
5064
                logger.Info("Not found, creating...")
×
5065
                err = n.client.Create(ctx, obj)
×
5066
                if err != nil {
×
5067
                        logger.Info("Couldn't create", "Error", err)
×
5068
                        return gpuv1.NotReady, err
×
UNCOV
5069
                }
×
5070
                return gpuv1.Ready, nil
×
5071
        } else if err != nil {
×
5072
                return gpuv1.NotReady, err
×
5073
        }
×
5074

5075
        logger.Info("Found Resource, updating...")
×
5076
        obj.ResourceVersion = found.ResourceVersion
×
5077

×
UNCOV
5078
        err = n.client.Update(ctx, obj)
×
5079
        if err != nil {
×
UNCOV
5080
                logger.Info("Couldn't update", "Error", err)
×
UNCOV
5081
                return gpuv1.NotReady, err
×
5082
        }
×
5083
        return gpuv1.Ready, nil
×
5084
}
5085

NEW
5086
func createDeviceClass(n ClusterPolicyController, spec resourceapi.DeviceClass) (gpuv1.State, error) {
×
NEW
5087
        ctx := n.ctx
×
NEW
5088
        state := n.idx
×
NEW
UNCOV
5089
        obj := spec.DeepCopy()
×
NEW
5090

×
NEW
5091
        logger := n.logger.WithValues("DeviceClass", obj.Name)
×
NEW
5092

×
NEW
5093
        // Check if state is disabled and cleanup resource if exists
×
NEW
UNCOV
5094
        if !n.isStateEnabled(n.stateNames[state]) ||
×
NEW
5095
                (strings.Contains(obj.Name, "compute-domain") && !n.singleton.Spec.DRADriver.IsComputeDomainsEnabled()) ||
×
NEW
5096
                (obj.Name == "gpu.nvidia.com" && !n.singleton.Spec.DRADriver.IsGPUsEnabled()) ||
×
NEW
5097
                (obj.Name == "mig.nvidia.com" && !n.singleton.Spec.DRADriver.IsGPUsEnabled()) {
×
NEW
5098
                err := n.client.Delete(ctx, obj)
×
NEW
5099
                if err != nil && !apierrors.IsNotFound(err) {
×
NEW
5100
                        logger.Info("Couldn't delete", "Error", err)
×
NEW
5101
                        return gpuv1.NotReady, err
×
NEW
5102
                }
×
NEW
5103
                return gpuv1.Disabled, nil
×
5104
        }
5105

NEW
5106
        if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
×
NEW
UNCOV
5107
                return gpuv1.NotReady, err
×
NEW
5108
        }
×
5109

NEW
5110
        found := &resourceapi.DeviceClass{}
×
NEW
5111
        err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found)
×
NEW
5112
        if err != nil && apierrors.IsNotFound(err) {
×
NEW
5113
                logger.Info("Not found, creating...")
×
NEW
5114
                err = n.client.Create(ctx, obj)
×
NEW
UNCOV
5115
                if err != nil {
×
NEW
5116
                        logger.Info("Couldn't create", "Error", err)
×
NEW
UNCOV
5117
                        return gpuv1.NotReady, err
×
NEW
UNCOV
5118
                }
×
NEW
UNCOV
5119
                return gpuv1.Ready, nil
×
NEW
5120
        } else if err != nil {
×
NEW
5121
                return gpuv1.NotReady, err
×
NEW
5122
        }
×
5123

NEW
5124
        logger.Info("Found Resource, updating...")
×
NEW
5125
        obj.ResourceVersion = found.ResourceVersion
×
NEW
5126

×
NEW
5127
        err = n.client.Update(ctx, obj)
×
NEW
5128
        if err != nil {
×
NEW
5129
                logger.Info("Couldn't update", "Error", err)
×
NEW
5130
                return gpuv1.NotReady, err
×
NEW
UNCOV
5131
        }
×
NEW
5132
        return gpuv1.Ready, nil
×
5133
}
5134

5135
// DeviceClasses creates DeviceClass objects
NEW
5136
func DeviceClasses(n ClusterPolicyController) (gpuv1.State, error) {
×
NEW
5137
        status := gpuv1.Ready
×
NEW
5138
        state := n.idx
×
NEW
5139

×
NEW
5140
        for _, obj := range n.resources[state].DeviceClasses {
×
NEW
5141
                obj := obj
×
NEW
5142
                stat, err := createDeviceClass(n, obj)
×
NEW
5143
                if err != nil {
×
NEW
5144
                        return stat, err
×
NEW
UNCOV
5145
                }
×
5146

NEW
5147
                switch stat {
×
NEW
5148
                case gpuv1.Ready:
×
NEW
5149
                        continue
×
NEW
5150
                case gpuv1.Disabled:
×
NEW
5151
                        continue
×
NEW
5152
                default:
×
NEW
5153
                        status = gpuv1.NotReady
×
5154
                }
5155
        }
NEW
5156
        return status, nil
×
5157
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc