
NVIDIA / gpu-operator / 20858835142

09 Jan 2026 04:42PM UTC coverage: 23.925% (+0.3%) from 23.613%
Build 20858835142 · push · github · karthikvetrivel
Add driver configuration digest computation and driver type support

Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>

53 of 90 new or added lines in 3 files covered. (58.89%)

1 existing line in 1 file now uncovered.

2850 of 11912 relevant lines covered (23.93%)

0.27 hits per line

Source File

/cmd/nvidia-validator/main.go (0.0% coverage)
1
/*
2
 * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package main
18

19
import (
20
        "context"
21
        "fmt"
22
        "os"
23
        "os/exec"
24
        "os/signal"
25
        "path/filepath"
26
        "regexp"
27
        "strings"
28
        "syscall"
29
        "time"
30

31
        "github.com/NVIDIA/go-nvlib/pkg/nvmdev"
32
        "github.com/NVIDIA/go-nvlib/pkg/nvpci"
33
        devchar "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/system/create-dev-char-symlinks"
34
        log "github.com/sirupsen/logrus"
35
        cli "github.com/urfave/cli/v3"
36
        corev1 "k8s.io/api/core/v1"
37
        "k8s.io/apimachinery/pkg/api/resource"
38
        meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
39
        "k8s.io/apimachinery/pkg/fields"
40
        "k8s.io/apimachinery/pkg/labels"
41
        "k8s.io/apimachinery/pkg/runtime/serializer/json"
42
        "k8s.io/apimachinery/pkg/util/wait"
43
        "k8s.io/client-go/kubernetes"
44
        "k8s.io/client-go/kubernetes/scheme"
45
        "k8s.io/client-go/rest"
46

47
        nvidiav1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
48
        nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
49
        "github.com/NVIDIA/gpu-operator/internal/info"
50
)
51

52
// Component of GPU operator
53
type Component interface {
54
        validate() error
55
        createStatusFile() error
56
        deleteStatusFile() error
57
}
58

59
// Driver component
60
type Driver struct {
61
        ctx context.Context
62
}
63

64
// NvidiaFs GDS Driver component
65
type NvidiaFs struct{}
66

67
// GDRCopy driver component
68
type GDRCopy struct{}
69

70
// CUDA represents spec to run cuda workload
71
type CUDA struct {
72
        ctx        context.Context
73
        kubeClient kubernetes.Interface
74
}
75

76
// Plugin component
77
type Plugin struct {
78
        ctx        context.Context
79
        kubeClient kubernetes.Interface
80
}
81

82
// Toolkit component
83
type Toolkit struct{}
84

85
// MOFED represents spec to validate MOFED driver installation
86
type MOFED struct {
87
        ctx        context.Context
88
        kubeClient kubernetes.Interface
89
}
90

91
// Metrics represents spec to run metrics exporter
92
type Metrics struct {
93
        ctx context.Context
94
}
95

96
// VfioPCI represents spec to validate vfio-pci driver
97
type VfioPCI struct {
98
        ctx context.Context
99
}
100

101
// VGPUManager represents spec to validate vGPU Manager installation
102
type VGPUManager struct {
103
        ctx context.Context
104
}
105

106
// VGPUDevices represents spec to validate vGPU device creation
107
type VGPUDevices struct {
108
        ctx context.Context
109
}
110

111
// CCManager represents spec to validate CC Manager installation
112
type CCManager struct {
113
        ctx        context.Context
114
        kubeClient kubernetes.Interface
115
}
116

117
var (
118
        kubeconfigFlag                string
119
        nodeNameFlag                  string
120
        namespaceFlag                 string
121
        withWaitFlag                  bool
122
        withWorkloadFlag              bool
123
        componentFlag                 string
124
        cleanupAllFlag                bool
125
        outputDirFlag                 string
126
        sleepIntervalSecondsFlag      int
127
        migStrategyFlag               string
128
        metricsPort                   int
129
        defaultGPUWorkloadConfigFlag  string
130
        disableDevCharSymlinkCreation bool
131
        hostRootFlag                  string
132
        driverInstallDirFlag          string
133
        driverInstallDirCtrPathFlag   string
134
)
135

136
// defaultGPUWorkloadConfig is "vm-passthrough" unless
137
// overridden by defaultGPUWorkloadConfigFlag
138
var defaultGPUWorkloadConfig = gpuWorkloadConfigVMPassthrough
139

140
const (
141
        // defaultStatusPath indicates directory to create all validation status files
142
        defaultStatusPath = "/run/nvidia/validations"
143
        // defaultSleepIntervalSeconds indicates sleep interval in seconds between validation command retries
144
        defaultSleepIntervalSeconds = 5
145
        // defaultMetricsPort indicates the port on which the metrics will be exposed.
146
        defaultMetricsPort = 0
147
        // hostDevCharPath indicates the path in the container where the host '/dev/char' directory is mounted to
148
        hostDevCharPath = "/host-dev-char"
149
        // defaultDriverInstallDir indicates the default path on the host where the driver container installation is made available
150
        defaultDriverInstallDir = "/run/nvidia/driver"
151
        // defaultDriverInstallDirCtrPath indicates the default path where the NVIDIA driver install dir is mounted in the container
152
        defaultDriverInstallDirCtrPath = "/run/nvidia/driver"
153
        // driverStatusFile indicates status file for containerized driver readiness
154
        driverStatusFile = "driver-ready"
155
        // nvidiaFsStatusFile indicates status file for nvidia-fs driver readiness
156
        nvidiaFsStatusFile = "nvidia-fs-ready"
157
        // gdrCopyStatusFile indicates status file for GDRCopy driver (gdrdrv) readiness
158
        gdrCopyStatusFile = "gdrcopy-ready"
159
        // toolkitStatusFile indicates status file for toolkit readiness
160
        toolkitStatusFile = "toolkit-ready"
161
        // pluginStatusFile indicates status file for plugin readiness
162
        pluginStatusFile = "plugin-ready"
163
        // cudaStatusFile indicates status file for cuda readiness
164
        cudaStatusFile = "cuda-ready"
165
        // mofedStatusFile indicates status file for mofed driver readiness
166
        mofedStatusFile = "mofed-ready"
167
        // vfioPCIStatusFile indicates status file for vfio-pci driver readiness
168
        vfioPCIStatusFile = "vfio-pci-ready"
169
        // vGPUManagerStatusFile indicates status file for vGPU Manager driver readiness
170
        vGPUManagerStatusFile = "vgpu-manager-ready"
171
        // hostVGPUManagerStatusFile indicates status file for host vGPU Manager driver readiness
172
        hostVGPUManagerStatusFile = "host-vgpu-manager-ready"
173
        // vGPUDevicesStatusFile is name of the file which indicates vGPU Manager is installed and vGPU devices have been created
174
        vGPUDevicesStatusFile = "vgpu-devices-ready"
175
        // ccManagerStatusFile indicates status file for cc-manager readiness
176
        ccManagerStatusFile = "cc-manager-ready"
177
        // workloadTypeStatusFile is the name of the file which specifies the workload type configured for the node
178
        workloadTypeStatusFile = "workload-type"
179
        // podCreationWaitRetries indicates total retries to wait for plugin validation pod creation
180
        podCreationWaitRetries = 60
181
        // podCreationSleepIntervalSeconds indicates sleep interval in seconds between checking for plugin validation pod readiness
182
        podCreationSleepIntervalSeconds = 5
183
        // gpuResourceDiscoveryWaitRetries indicates total retries to wait for the node to discover GPU resources
184
        gpuResourceDiscoveryWaitRetries = 30
185
        // gpuResourceDiscoveryIntervalSeconds indicates sleep interval in seconds between checking for available GPU resources
186
        gpuResourceDiscoveryIntervalSeconds = 5
187
        // genericGPUResourceType indicates the generic name of the GPU exposed by NVIDIA DevicePlugin
188
        genericGPUResourceType = "nvidia.com/gpu"
189
        // migGPUResourcePrefix indicates the prefix of the MIG resources exposed by NVIDIA DevicePlugin
190
        migGPUResourcePrefix = "nvidia.com/mig-"
191
        // migStrategySingle indicates the single MIG strategy
192
        migStrategySingle = "single"
193
        // pluginWorkloadPodSpecPath indicates path to plugin validation pod definition
194
        pluginWorkloadPodSpecPath = "/opt/validator/manifests/plugin-workload-validation.yaml"
195
        // cudaWorkloadPodSpecPath indicates path to cuda validation pod definition
196
        cudaWorkloadPodSpecPath = "/opt/validator/manifests/cuda-workload-validation.yaml"
197
        // validatorImageEnvName indicates env name for validator image passed
198
        validatorImageEnvName = "VALIDATOR_IMAGE"
199
        // validatorImagePullPolicyEnvName indicates env name for validator image pull policy passed
200
        validatorImagePullPolicyEnvName = "VALIDATOR_IMAGE_PULL_POLICY"
201
        // validatorImagePullSecretsEnvName indicates env name for validator image pull secrets passed
202
        validatorImagePullSecretsEnvName = "VALIDATOR_IMAGE_PULL_SECRETS"
203
        // validatorRuntimeClassEnvName indicates env name for validator runtimeclass passed
204
        validatorRuntimeClassEnvName = "VALIDATOR_RUNTIME_CLASS"
205
        // cudaValidatorLabelValue represents label for cuda workload validation pod
206
        cudaValidatorLabelValue = "nvidia-cuda-validator"
207
        // pluginValidatorLabelValue represents label for device-plugin workload validation pod
208
        pluginValidatorLabelValue = "nvidia-device-plugin-validator"
209
        // MellanoxDeviceLabelKey represents NFD label name for Mellanox devices
210
        MellanoxDeviceLabelKey = "feature.node.kubernetes.io/pci-15b3.present"
211
        // GPUDirectRDMAEnabledEnvName represents env name to indicate if GPUDirect RDMA is enabled through GPU Operator
212
        GPUDirectRDMAEnabledEnvName = "GPU_DIRECT_RDMA_ENABLED"
213
        // UseHostMOFEDEnvname represents env name to indicate if MOFED is pre-installed on host
214
        UseHostMOFEDEnvname = "USE_HOST_MOFED"
215
        // TODO: create a common package to share these variables between operator and validator
216
        gpuWorkloadConfigLabelKey      = "nvidia.com/gpu.workload.config"
217
        gpuWorkloadConfigContainer     = "container"
218
        gpuWorkloadConfigVMPassthrough = "vm-passthrough"
219
        gpuWorkloadConfigVMVgpu        = "vm-vgpu"
220
        // CCCapableLabelKey represents NFD label name to indicate if the node is capable to run CC workloads
221
        CCCapableLabelKey = "nvidia.com/cc.capable"
222
        // appComponentLabelKey indicates the label key of the component
223
        appComponentLabelKey = "app.kubernetes.io/component"
224
        // wslNvidiaSMIPath indicates the path to the nvidia-smi binary on WSL
225
        wslNvidiaSMIPath = "/usr/lib/wsl/lib/nvidia-smi"
226
        // shell indicates what shell to use when invoking commands in a subprocess
227
        shell = "sh"
228
        // defaultVFWaitTimeout is the default timeout for waiting for VFs to be created
229
        defaultVFWaitTimeout = 5 * time.Minute
230
)
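
On a fully validated node, the status directory defined above typically ends up holding flat marker files named after the constants in this block. An illustrative listing, assuming the default /run/nvidia/validations path; exactly which files appear depends on the components validated on that node:

/run/nvidia/validations/driver-ready
/run/nvidia/validations/toolkit-ready
/run/nvidia/validations/plugin-ready
/run/nvidia/validations/cuda-ready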
231

232
func main() {
×
233
        c := cli.Command{}
×
234

×
235
        c.Before = validateFlags
×
236
        c.Action = start
×
237
        c.Version = info.GetVersionString()
×
238

×
239
        c.Flags = []cli.Flag{
×
240
                &cli.StringFlag{
×
241
                        Name:        "kubeconfig",
×
242
                        Value:       "",
×
243
                        Usage:       "absolute path to the kubeconfig file",
×
244
                        Destination: &kubeconfigFlag,
×
245
                        Sources:     cli.EnvVars("KUBECONFIG"),
×
246
                },
×
247
                &cli.StringFlag{
×
248
                        Name:        "node-name",
×
249
                        Aliases:     []string{"n"},
×
250
                        Value:       "",
×
251
                        Usage:       "the name of the node to deploy plugin validation pod",
×
252
                        Destination: &nodeNameFlag,
×
253
                        Sources:     cli.EnvVars("NODE_NAME"),
×
254
                },
×
255
                &cli.StringFlag{
×
256
                        Name:        "namespace",
×
257
                        Aliases:     []string{"ns"},
×
258
                        Value:       "",
×
259
                        Usage:       "the namespace in which the operator resources are deployed",
×
260
                        Destination: &namespaceFlag,
×
261
                        Sources:     cli.EnvVars("OPERATOR_NAMESPACE"),
×
262
                },
×
263
                &cli.BoolFlag{
×
264
                        Name:        "with-wait",
×
265
                        Aliases:     []string{"w"},
×
266
                        Value:       false,
×
267
                        Usage:       "indicates to wait for validation to complete successfully",
×
268
                        Destination: &withWaitFlag,
×
269
                        Sources:     cli.EnvVars("WITH_WAIT"),
×
270
                },
×
271
                &cli.BoolFlag{
×
272
                        Name:        "with-workload",
×
273
                        Aliases:     []string{"l"},
×
274
                        Value:       true,
×
275
                        Usage:       "indicates to validate with GPU workload",
×
276
                        Destination: &withWorkloadFlag,
×
277
                        Sources:     cli.EnvVars("WITH_WORKLOAD"),
×
278
                },
×
279
                &cli.StringFlag{
×
280
                        Name:        "component",
×
281
                        Aliases:     []string{"c"},
×
282
                        Value:       "",
×
283
                        Usage:       "the name of the operator component to validate",
×
284
                        Destination: &componentFlag,
×
285
                        Sources:     cli.EnvVars("COMPONENT"),
×
286
                },
×
287
                &cli.BoolFlag{
×
288
                        Name:        "cleanup-all",
×
289
                        Aliases:     []string{"r"},
×
290
                        Value:       false,
×
291
                        Usage:       "indicates to cleanup all previous validation status files",
×
292
                        Destination: &cleanupAllFlag,
×
293
                        Sources:     cli.EnvVars("CLEANUP_ALL"),
×
294
                },
×
295
                &cli.StringFlag{
×
296
                        Name:        "output-dir",
×
297
                        Aliases:     []string{"o"},
×
298
                        Value:       defaultStatusPath,
×
299
                        Usage:       "output directory where all validation status files are created",
×
300
                        Destination: &outputDirFlag,
×
301
                        Sources:     cli.EnvVars("OUTPUT_DIR"),
×
302
                },
×
303
                &cli.IntFlag{
×
304
                        Name:        "sleep-interval-seconds",
×
305
                        Aliases:     []string{"s"},
×
306
                        Value:       defaultSleepIntervalSeconds,
×
307
                        Usage:       "sleep interval in seconds between command retries",
×
308
                        Destination: &sleepIntervalSecondsFlag,
×
309
                        Sources:     cli.EnvVars("SLEEP_INTERVAL_SECONDS"),
×
310
                },
×
311
                &cli.StringFlag{
×
312
                        Name:        "mig-strategy",
×
313
                        Aliases:     []string{"m"},
×
314
                        Value:       migStrategySingle,
×
315
                        Usage:       "MIG Strategy",
×
316
                        Destination: &migStrategyFlag,
×
317
                        Sources:     cli.EnvVars("MIG_STRATEGY"),
×
318
                },
×
319
                &cli.IntFlag{
×
320
                        Name:        "metrics-port",
×
321
                        Aliases:     []string{"p"},
×
322
                        Value:       defaultMetricsPort,
×
323
                        Usage:       "port on which the metrics will be exposed. 0 means disabled.",
×
324
                        Destination: &metricsPort,
×
325
                        Sources:     cli.EnvVars("METRICS_PORT"),
×
326
                },
×
327
                &cli.StringFlag{
×
328
                        Name:        "default-gpu-workload-config",
×
329
                        Aliases:     []string{"g"},
×
330
                        Value:       "",
×
331
                        Usage:       "default GPU workload config. determines what components to validate by default when sandbox workloads are enabled in the cluster.",
×
332
                        Destination: &defaultGPUWorkloadConfigFlag,
×
333
                        Sources:     cli.EnvVars("DEFAULT_GPU_WORKLOAD_CONFIG"),
×
334
                },
×
335
                &cli.BoolFlag{
×
336
                        Name:        "disable-dev-char-symlink-creation",
×
337
                        Value:       false,
×
338
                        Usage:       "disable creation of symlinks under /dev/char corresponding to NVIDIA character devices",
×
339
                        Destination: &disableDevCharSymlinkCreation,
×
340
                        Sources:     cli.EnvVars("DISABLE_DEV_CHAR_SYMLINK_CREATION"),
×
341
                },
×
342
                &cli.StringFlag{
×
343
                        Name:        "host-root",
×
344
                        Value:       "/",
×
345
                        Usage:       "root path of the underlying host",
×
346
                        Destination: &hostRootFlag,
×
347
                        Sources:     cli.EnvVars("HOST_ROOT"),
×
348
                },
×
349
                &cli.StringFlag{
×
350
                        Name:        "driver-install-dir",
×
351
                        Value:       defaultDriverInstallDir,
×
352
                        Usage:       "the path on the host where a containerized NVIDIA driver installation is made available",
×
353
                        Destination: &driverInstallDirFlag,
×
354
                        Sources:     cli.EnvVars("DRIVER_INSTALL_DIR"),
×
355
                },
×
356
                &cli.StringFlag{
×
357
                        Name:        "driver-install-dir-ctr-path",
×
358
                        Value:       defaultDriverInstallDirCtrPath,
×
359
                        Usage:       "the path where the NVIDIA driver install dir is mounted in the container",
×
360
                        Destination: &driverInstallDirCtrPathFlag,
×
361
                        Sources:     cli.EnvVars("DRIVER_INSTALL_DIR_CTR_PATH"),
×
362
                },
×
363
        }
×
364

×
365
        // Log version info
×
366
        log.Infof("version: %s", c.Version)
×
367

×
368
        // Handle signals
×
369
        go handleSignal()
×
370

×
371
        // invoke command
×
372
        err := c.Run(context.Background(), os.Args)
×
373
        if err != nil {
×
374
                log.SetOutput(os.Stderr)
×
375
                log.Printf("Error: %v", err)
×
376
                os.Exit(1)
×
377
        }
×
378
}
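
An illustrative invocation of this command (a sketch only: the binary name follows the cmd/nvidia-validator directory, the node and namespace values are hypothetical, and each flag can also be supplied through the environment variables referenced above):

nvidia-validator --component plugin --node-name worker-0 --namespace gpu-operator --with-wait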
379

380
func handleSignal() {
×
381
        // Handle signals
×
382
        stop := make(chan os.Signal, 1)
×
383
        signal.Notify(stop, os.Interrupt,
×
384
                syscall.SIGTERM, syscall.SIGHUP, syscall.SIGINT, syscall.SIGQUIT)
×
385

×
386
        s := <-stop
×
387
        log.Fatalf("Exiting due to signal [%v] notification for pid [%d]", s.String(), os.Getpid())
×
388
}
×
389

390
func validateFlags(ctx context.Context, cli *cli.Command) (context.Context, error) {
×
391
        if componentFlag == "" {
×
392
                return ctx, fmt.Errorf("invalid -c <component-name> flag: must not be empty string")
×
393
        }
×
394
        if !isValidComponent() {
×
395
                return ctx, fmt.Errorf("invalid -c <component-name> flag value: %s", componentFlag)
×
396
        }
×
397
        if componentFlag == "plugin" {
×
398
                if nodeNameFlag == "" {
×
399
                        return ctx, fmt.Errorf("invalid -n <node-name> flag: must not be empty string for plugin validation")
×
400
                }
×
401
                if namespaceFlag == "" {
×
402
                        return ctx, fmt.Errorf("invalid -ns <namespace> flag: must not be empty string for plugin validation")
×
403
                }
×
404
        }
405
        if componentFlag == "cuda" && namespaceFlag == "" {
×
406
                return ctx, fmt.Errorf("invalid -ns <namespace> flag: must not be empty string for cuda validation")
×
407
        }
×
408
        if componentFlag == "metrics" {
×
409
                if metricsPort == defaultMetricsPort {
×
410
                        return ctx, fmt.Errorf("invalid -p <port> flag: must not be empty or 0 for the metrics component")
×
411
                }
×
412
                if nodeNameFlag == "" {
×
413
                        return ctx, fmt.Errorf("invalid -n <node-name> flag: must not be empty string for metrics exporter")
×
414
                }
×
415
        }
416
        if nodeNameFlag == "" && (componentFlag == "vfio-pci" || componentFlag == "vgpu-manager" || componentFlag == "vgpu-devices") {
×
417
                return ctx, fmt.Errorf("invalid -n <node-name> flag: must not be empty string for %s validation", componentFlag)
×
418
        }
×
419

420
        return ctx, nil
×
421
}
422

423
func isValidComponent() bool {
×
424
        switch componentFlag {
×
425
        case "driver":
×
426
                fallthrough
×
427
        case "toolkit":
×
428
                fallthrough
×
429
        case "cuda":
×
430
                fallthrough
×
431
        case "metrics":
×
432
                fallthrough
×
433
        case "plugin":
×
434
                fallthrough
×
435
        case "mofed":
×
436
                fallthrough
×
437
        case "vfio-pci":
×
438
                fallthrough
×
439
        case "vgpu-manager":
×
440
                fallthrough
×
441
        case "vgpu-devices":
×
442
                fallthrough
×
443
        case "cc-manager":
×
444
                fallthrough
×
445
        case "nvidia-fs":
×
446
                fallthrough
×
447
        case "gdrcopy":
×
448
                return true
×
449
        default:
×
450
                return false
×
451
        }
452
}
453

454
func isValidWorkloadConfig(config string) bool {
×
455
        return config == gpuWorkloadConfigContainer ||
×
456
                config == gpuWorkloadConfigVMPassthrough ||
×
457
                config == gpuWorkloadConfigVMVgpu
×
458
}
×
459

460
func getWorkloadConfig(ctx context.Context) (string, error) {
×
461
        // check if default workload is overridden by flag
×
462
        if isValidWorkloadConfig(defaultGPUWorkloadConfigFlag) {
×
463
                defaultGPUWorkloadConfig = defaultGPUWorkloadConfigFlag
×
464
        }
×
465

466
        kubeConfig, err := rest.InClusterConfig()
×
467
        if err != nil {
×
468
                return "", fmt.Errorf("error getting cluster config - %s", err.Error())
×
469
        }
×
470

471
        kubeClient, err := kubernetes.NewForConfig(kubeConfig)
×
472
        if err != nil {
×
473
                return "", fmt.Errorf("error getting k8s client - %w", err)
×
474
        }
×
475

476
        node, err := getNode(ctx, kubeClient)
×
477
        if err != nil {
×
478
                return "", fmt.Errorf("error getting node labels - %w", err)
×
479
        }
×
480

481
        labels := node.GetLabels()
×
482
        value, ok := labels[gpuWorkloadConfigLabelKey]
×
483
        if !ok {
×
484
                log.Infof("No %s label found; using default workload config: %s", gpuWorkloadConfigLabelKey, defaultGPUWorkloadConfig)
×
485
                return defaultGPUWorkloadConfig, nil
×
486
        }
×
487
        if !isValidWorkloadConfig(value) {
×
488
                log.Warnf("%s is an invalid workload config; using default workload config: %s", value, defaultGPUWorkloadConfig)
×
489
                return defaultGPUWorkloadConfig, nil
×
490
        }
×
491
        return value, nil
×
492
}
493

494
func start(ctx context.Context, cli *cli.Command) error {
×
495
        // if cleanup is requested, delete all existing status files first
×
496
        if cleanupAllFlag {
×
497
                // cleanup output directory and create again each time
×
498
                err := os.RemoveAll(outputDirFlag)
×
499
                if err != nil {
×
500
                        if !os.IsNotExist(err) {
×
501
                                return err
×
502
                        }
×
503
                }
504
        }
505

506
        // create status directory
507
        err := os.Mkdir(outputDirFlag, 0755)
×
508
        if err != nil && !os.IsExist(err) {
×
509
                return err
×
510
        }
×
511

512
        switch componentFlag {
×
513
        case "driver":
×
514
                driver := &Driver{
×
515
                        ctx: ctx,
×
516
                }
×
517
                err := driver.validate()
×
518
                if err != nil {
×
519
                        return fmt.Errorf("error validating driver installation: %w", err)
×
520
                }
×
521
                return nil
×
522
        case "nvidia-fs":
×
523
                nvidiaFs := &NvidiaFs{}
×
524
                err := nvidiaFs.validate()
×
525
                if err != nil {
×
526
                        return fmt.Errorf("error validating nvidia-fs driver installation: %w", err)
×
527
                }
×
528
                return nil
×
529
        case "gdrcopy":
×
530
                gdrcopy := &GDRCopy{}
×
531
                err := gdrcopy.validate()
×
532
                if err != nil {
×
533
                        return fmt.Errorf("error validating gdrcopy driver installation: %w", err)
×
534
                }
×
535
                return nil
×
536
        case "toolkit":
×
537
                toolkit := &Toolkit{}
×
538
                err := toolkit.validate()
×
539
                if err != nil {
×
540
                        return fmt.Errorf("error validating toolkit installation: %w", err)
×
541
                }
×
542
                return nil
×
543
        case "cuda":
×
544
                cuda := &CUDA{
×
545
                        ctx: ctx,
×
546
                }
×
547
                err := cuda.validate()
×
548
                if err != nil {
×
549
                        return fmt.Errorf("error validating cuda workload: %w", err)
×
550
                }
×
551
                return nil
×
552
        case "plugin":
×
553
                plugin := &Plugin{
×
554
                        ctx: ctx,
×
555
                }
×
556
                err := plugin.validate()
×
557
                if err != nil {
×
558
                        return fmt.Errorf("error validating plugin installation: %w", err)
×
559
                }
×
560
                return nil
×
561
        case "mofed":
×
562
                mofed := &MOFED{
×
563
                        ctx: ctx,
×
564
                }
×
565
                err := mofed.validate()
×
566
                if err != nil {
×
567
                        return fmt.Errorf("error validating MOFED driver installation: %s", err)
×
568
                }
×
569
                return nil
×
570
        case "metrics":
×
571
                metrics := &Metrics{
×
572
                        ctx: ctx,
×
573
                }
×
574
                err := metrics.run()
×
575
                if err != nil {
×
576
                        return fmt.Errorf("error running validation-metrics exporter: %s", err)
×
577
                }
×
578
                return nil
×
579
        case "vfio-pci":
×
580
                vfioPCI := &VfioPCI{
×
581
                        ctx: ctx,
×
582
                }
×
583
                err := vfioPCI.validate()
×
584
                if err != nil {
×
585
                        return fmt.Errorf("error validating vfio-pci driver installation: %w", err)
×
586
                }
×
587
                return nil
×
588
        case "vgpu-manager":
×
589
                vGPUManager := &VGPUManager{
×
590
                        ctx: ctx,
×
591
                }
×
592
                err := vGPUManager.validate()
×
593
                if err != nil {
×
594
                        return fmt.Errorf("error validating vGPU Manager installation: %w", err)
×
595
                }
×
596
                return nil
×
597
        case "vgpu-devices":
×
598
                vGPUDevices := &VGPUDevices{
×
599
                        ctx: ctx,
×
600
                }
×
601
                err := vGPUDevices.validate()
×
602
                if err != nil {
×
603
                        return fmt.Errorf("error validating vGPU devices: %s", err)
×
604
                }
×
605
                return nil
×
606
        case "cc-manager":
×
607
                CCManager := &CCManager{
×
608
                        ctx: ctx,
×
609
                }
×
610
                err := CCManager.validate()
×
611
                if err != nil {
×
612
                        return fmt.Errorf("error validating CC Manager installation: %w", err)
×
613
                }
×
614
                return nil
×
615
        default:
×
616
                return fmt.Errorf("invalid component specified for validation: %s", componentFlag)
×
617
        }
618
}
619

620
func runCommand(command string, args []string, silent bool) error {
×
621
        cmd := exec.Command(command, args...)
×
622
        if !silent {
×
623
                cmd.Stdout = os.Stdout
×
624
                cmd.Stderr = os.Stderr
×
625
        }
×
626
        return cmd.Run()
×
627
}
628

629
func runCommandWithWait(command string, args []string, sleepSeconds int, silent bool) error {
×
630
        for {
×
631
                cmd := exec.Command(command, args...)
×
632
                if !silent {
×
633
                        cmd.Stdout = os.Stdout
×
634
                        cmd.Stderr = os.Stderr
×
635
                }
×
636
                fmt.Printf("running command %s with args %v\n", command, args)
×
637
                err := cmd.Run()
×
638
                if err != nil {
×
639
                        log.Warningf("error running command: %v", err)
×
640
                        fmt.Printf("command failed, retrying after %d seconds\n", sleepSeconds)
×
641
                        time.Sleep(time.Duration(sleepSeconds) * time.Second)
×
642
                        continue
×
643
                }
644
                return nil
×
645
        }
646
}
647

648
// prependPathListEnvvar prepends a specified list of strings to a specified envvar and returns its value.
649
func prependPathListEnvvar(envvar string, prepend ...string) string {
×
650
        if len(prepend) == 0 {
×
651
                return os.Getenv(envvar)
×
652
        }
×
653
        current := filepath.SplitList(os.Getenv(envvar))
×
654
        return strings.Join(append(prepend, current...), string(filepath.ListSeparator))
×
655
}
656

657
// setEnvVar adds or updates an envvar in the list of specified envvars and returns it.
658
func setEnvVar(envvars []string, key, value string) []string {
×
659
        var updated []string
×
660
        for _, envvar := range envvars {
×
661
                pair := strings.SplitN(envvar, "=", 2)
×
662
                if pair[0] == key {
×
663
                        continue
×
664
                }
665
                updated = append(updated, envvar)
×
666
        }
667
        return append(updated, fmt.Sprintf("%s=%s", key, value))
×
668
}
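
A minimal usage sketch of the two helpers above, mirroring how they are combined later in this file to preload the NVIDIA ML library into a subprocess; the function name and the library path shown here are illustrative only:

func exampleEnvInjection() *exec.Cmd {
        cmd := exec.Command("nvidia-smi")
        // Prepend a hypothetical driver library path to LD_PRELOAD and replace any
        // existing LD_PRELOAD entry in the inherited environment.
        cmd.Env = setEnvVar(os.Environ(), "LD_PRELOAD",
                prependPathListEnvvar("LD_PRELOAD", "/run/nvidia/driver/usr/lib64/libnvidia-ml.so.1"))
        // If LD_PRELOAD was previously "/opt/foo.so", the resulting entry is
        // "LD_PRELOAD=/run/nvidia/driver/usr/lib64/libnvidia-ml.so.1:/opt/foo.so".
        return cmd
}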
669

670
// For driver container installs, check existence of .driver-ctr-ready to confirm the running driver
671
// container has completed and is in Ready state.
672
func assertDriverContainerReady(silent bool) error {
×
673
        command := shell
×
674
        args := []string{"-c", "stat /run/nvidia/validations/.driver-ctr-ready"}
×
675

×
676
        if withWaitFlag {
×
677
                return runCommandWithWait(command, args, sleepIntervalSecondsFlag, silent)
×
678
        }
×
679

680
        return runCommand(command, args, silent)
×
681
}
682

683
// isDriverManagedByOperator determines if the NVIDIA driver is managed by the GPU Operator.
684
// We check if at least one driver DaemonSet exists in the operator namespace that is
685
// owned by the ClusterPolicy or NVIDIADriver controllers.
686
func isDriverManagedByOperator(ctx context.Context) (bool, error) {
×
687
        kubeConfig, err := rest.InClusterConfig()
×
688
        if err != nil {
×
689
                return false, fmt.Errorf("error getting cluster config: %w", err)
×
690
        }
×
691

692
        kubeClient, err := kubernetes.NewForConfig(kubeConfig)
×
693
        if err != nil {
×
694
                return false, fmt.Errorf("error getting k8s client: %w", err)
×
695
        }
×
696

697
        opts := meta_v1.ListOptions{LabelSelector: labels.Set{appComponentLabelKey: "nvidia-driver"}.AsSelector().String()}
×
698
        dsList, err := kubeClient.AppsV1().DaemonSets(namespaceFlag).List(ctx, opts)
×
699
        if err != nil {
×
700
                return false, fmt.Errorf("error listing daemonsets: %w", err)
×
701
        }
×
702

703
        for i := range dsList.Items {
×
704
                ds := dsList.Items[i]
×
705
                owner := meta_v1.GetControllerOf(&ds)
×
706
                if owner == nil {
×
707
                        continue
×
708
                }
709
                if strings.HasPrefix(owner.APIVersion, "nvidia.com/") && (owner.Kind == nvidiav1.ClusterPolicyCRDName || owner.Kind == nvidiav1alpha1.NVIDIADriverCRDName) {
×
710
                        return true, nil
×
711
                }
×
712
        }
713

714
        return false, nil
×
715
}
716

717
func validateHostDriver(silent bool) error {
×
718
        log.Info("Attempting to validate a pre-installed driver on the host")
×
719
        if fileInfo, err := os.Lstat(filepath.Join("/host", wslNvidiaSMIPath)); err == nil && fileInfo.Size() != 0 {
×
720
                log.Infof("WSL2 system detected, assuming driver is pre-installed")
×
721
                disableDevCharSymlinkCreation = true
×
722
                return nil
×
723
        }
×
724
        fileInfo, err := os.Lstat("/host/usr/bin/nvidia-smi")
×
725
        if err != nil {
×
726
                return fmt.Errorf("no 'nvidia-smi' file present on the host: %w", err)
×
727
        }
×
728
        if fileInfo.Size() == 0 {
×
729
                return fmt.Errorf("empty 'nvidia-smi' file found on the host")
×
730
        }
×
731
        command := "chroot"
×
732
        args := []string{"/host", "nvidia-smi"}
×
733

×
734
        return runCommand(command, args, silent)
×
735
}
736

737
func validateDriverContainer(silent bool, ctx context.Context) error {
×
738
        driverManagedByOperator, err := isDriverManagedByOperator(ctx)
×
739
        if err != nil {
×
740
                return fmt.Errorf("error checking if driver is managed by GPU Operator: %w", err)
×
741
        }
×
742

743
        if driverManagedByOperator {
×
744
                log.Infof("Driver is not pre-installed on the host and is managed by GPU Operator. Checking driver container status.")
×
745
                if err := assertDriverContainerReady(silent); err != nil {
×
746
                        return fmt.Errorf("error checking driver container status: %w", err)
×
747
                }
×
748
        }
749

750
        driverRoot := root(driverInstallDirCtrPathFlag)
×
751

×
752
        validateDriver := func(silent bool) error {
×
753
                driverLibraryPath, err := driverRoot.getDriverLibraryPath()
×
754
                if err != nil {
×
755
                        return fmt.Errorf("failed to locate driver libraries: %w", err)
×
756
                }
×
757

758
                nvidiaSMIPath, err := driverRoot.getNvidiaSMIPath()
×
759
                if err != nil {
×
760
                        return fmt.Errorf("failed to locate nvidia-smi: %w", err)
×
761
                }
×
762
                cmd := exec.Command(nvidiaSMIPath)
×
763
                // In order for nvidia-smi to run, we need to update LD_PRELOAD to include the path to libnvidia-ml.so.1.
×
764
                cmd.Env = setEnvVar(os.Environ(), "LD_PRELOAD", prependPathListEnvvar("LD_PRELOAD", driverLibraryPath))
×
765
                if !silent {
×
766
                        cmd.Stdout = os.Stdout
×
767
                        cmd.Stderr = os.Stderr
×
768
                }
×
769
                return cmd.Run()
×
770
        }
771

772
        for {
×
773
                log.Info("Attempting to validate a driver container installation")
×
774
                err := validateDriver(silent)
×
775
                if err != nil {
×
776
                        if !withWaitFlag {
×
777
                                return fmt.Errorf("error validating driver: %w", err)
×
778
                        }
×
779
                        log.Warningf("failed to validate the driver, retrying after %d seconds\n", sleepIntervalSecondsFlag)
×
780
                        time.Sleep(time.Duration(sleepIntervalSecondsFlag) * time.Second)
×
781
                        continue
×
782
                }
783
                return nil
×
784
        }
785
}
786

787
func (d *Driver) runValidation(silent bool) (driverInfo, error) {
×
788
        err := validateHostDriver(silent)
×
789
        if err == nil {
×
790
                log.Info("Detected a pre-installed driver on the host")
×
791
                return getDriverInfo(true, hostRootFlag, hostRootFlag, "/host"), nil
×
792
        }
×
793

794
        err = validateDriverContainer(silent, d.ctx)
×
795
        if err != nil {
×
796
                return driverInfo{}, err
×
797
        }
×
798
        return getDriverInfo(false, hostRootFlag, driverInstallDirFlag, driverInstallDirCtrPathFlag), nil
×
799
}
800

801
func (d *Driver) validate() error {
×
802
        // delete driver status file if already present
×
803
        err := deleteStatusFile(outputDirFlag + "/" + driverStatusFile)
×
804
        if err != nil {
×
805
                return err
×
806
        }
×
807

808
        driverInfo, err := d.runValidation(false)
×
809
        if err != nil {
×
810
                log.Errorf("driver is not ready: %v", err)
×
811
                return err
×
812
        }
×
813

814
        err = createDevCharSymlinks(driverInfo, disableDevCharSymlinkCreation)
×
815
        if err != nil {
×
816
                msg := strings.Join([]string{
×
817
                        "Failed to create symlinks under /dev/char that point to all possible NVIDIA character devices.",
×
818
                        "The existence of these symlinks is required to address the following bug:",
×
819
                        "",
×
820
                        "    https://github.com/NVIDIA/gpu-operator/issues/430",
×
821
                        "",
×
822
                        "This bug impacts container runtimes configured with systemd cgroup management enabled.",
×
823
                        "To disable the symlink creation, set the following envvar in ClusterPolicy:",
×
824
                        "",
×
825
                        "    validator:",
×
826
                        "      driver:",
×
827
                        "        env:",
×
828
                        "        - name: DISABLE_DEV_CHAR_SYMLINK_CREATION",
×
829
                        "          value: \"true\""}, "\n")
×
830
                return fmt.Errorf("%w\n\n%s", err, msg)
×
831
        }
×
832

833
        return d.createStatusFile(driverInfo)
×
834
}
835

836
func (d *Driver) createStatusFile(driverInfo driverInfo) error {
×
837
        statusFileContent := strings.Join([]string{
×
838
                fmt.Sprintf("IS_HOST_DRIVER=%t", driverInfo.isHostDriver),
×
839
                fmt.Sprintf("NVIDIA_DRIVER_ROOT=%s", driverInfo.driverRoot),
×
840
                fmt.Sprintf("DRIVER_ROOT_CTR_PATH=%s", driverInfo.driverRootCtrPath),
×
841
                fmt.Sprintf("NVIDIA_DEV_ROOT=%s", driverInfo.devRoot),
×
842
                fmt.Sprintf("DEV_ROOT_CTR_PATH=%s", driverInfo.devRootCtrPath),
×
843
        }, "\n") + "\n"
×
844

×
845
        // create driver status file
×
846
        return createStatusFileWithContent(outputDirFlag+"/"+driverStatusFile, statusFileContent)
×
847
}
×
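
For reference, a driver-ready status file written by the method above might look like this for a driver that is pre-installed on the host with the default --host-root of '/'. The values are illustrative only, since the exact mapping comes from getDriverInfo, which is defined outside this file:

IS_HOST_DRIVER=true
NVIDIA_DRIVER_ROOT=/
DRIVER_ROOT_CTR_PATH=/host
NVIDIA_DEV_ROOT=/
DEV_ROOT_CTR_PATH=/host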
848

849
// areNvidiaModulesLoaded checks if NVIDIA kernel modules are already loaded in kernel memory.
NEW
850
func areNvidiaModulesLoaded() bool {
×
NEW
851
        // Check if the nvidia module is loaded by checking if /sys/module/nvidia/refcnt exists
×
NEW
852
        if _, err := os.Stat("/sys/module/nvidia/refcnt"); err == nil {
×
NEW
853
                refcntData, err := os.ReadFile("/sys/module/nvidia/refcnt")
×
NEW
854
                if err == nil {
×
NEW
855
                        refcnt := strings.TrimSpace(string(refcntData))
×
NEW
856
                        log.Infof("NVIDIA kernel modules already loaded in kernel memory (refcnt=%s)", refcnt)
×
NEW
857
                        return true
×
NEW
858
                }
×
859
        }
NEW
860
        return false
×
861
}
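
A minimal generalization sketch (not used by the validator) showing how the same /sys/module probe could be applied to any module name, for example the nvidia_fs and gdrdrv modules validated further below:

func isModuleLoaded(name string) bool {
        // A /sys/module/<name>/refcnt entry exists only while the module is loaded.
        _, err := os.Stat(filepath.Join("/sys/module", name, "refcnt"))
        return err == nil
}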
862

863
// createDevCharSymlinks creates symlinks in /host-dev-char that point to all possible NVIDIA device nodes.
864
func createDevCharSymlinks(driverInfo driverInfo, disableDevCharSymlinkCreation bool) error {
×
865
        if disableDevCharSymlinkCreation {
×
866
                log.WithField("disableDevCharSymlinkCreation", true).
×
867
                        Info("skipping the creation of symlinks under /dev/char that correspond to NVIDIA character devices")
×
868
                return nil
×
869
        }
×
870

871
        log.Info("creating symlinks under /dev/char that correspond to NVIDIA character devices")
×
872

×
NEW
873
        // Check if NVIDIA modules are already loaded in kernel memory.
×
NEW
874
        // If they are, we don't need to run modprobe (which would fail if modules aren't in /lib/modules/).
×
NEW
875
        // This handles the case where the driver container performed a userspace-only install
×
NEW
876
        // after detecting that modules were already loaded from a previous boot.
×
NEW
877
        modulesAlreadyLoaded := areNvidiaModulesLoaded()
×
NEW
878

×
NEW
879
        // Only attempt to load NVIDIA kernel modules when:
×
NEW
880
        // 1. Modules are not already loaded in kernel memory, AND
×
NEW
881
        // 2. We can chroot into driverRoot to run modprobe
×
NEW
882
        loadKernelModules := !modulesAlreadyLoaded && (driverInfo.isHostDriver || (driverInfo.devRoot == driverInfo.driverRoot))
×
883

×
884
        // driverRootCtrPath is the path of the driver install dir in the container. This will either be
×
885
        // driverInstallDirCtrPathFlag or '/host'.
×
886
        // Note, if we always mounted the driver install dir to '/driver-root' in the validation container
×
887
        // instead, then we could simplify to always use driverInfo.driverRootCtrPath -- which would be
×
888
        // either '/host' or '/driver-root', both paths would exist in the validation container.
×
889
        driverRootCtrPath := driverInstallDirCtrPathFlag
×
890
        if driverInfo.isHostDriver {
×
891
                driverRootCtrPath = "/host"
×
892
        }
×
893

894
        // We now create the symlinks in /dev/char.
895
        creator, err := devchar.NewSymlinkCreator(
×
896
                devchar.WithDriverRoot(driverRootCtrPath),
×
897
                devchar.WithDevRoot(driverInfo.devRoot),
×
898
                devchar.WithDevCharPath(hostDevCharPath),
×
899
                devchar.WithCreateAll(true),
×
900
                devchar.WithCreateDeviceNodes(true),
×
901
                devchar.WithLoadKernelModules(loadKernelModules),
×
902
        )
×
903
        if err != nil {
×
904
                return fmt.Errorf("error creating symlink creator: %w", err)
×
905
        }
×
906

907
        err = creator.CreateLinks()
×
908
        if err != nil {
×
909
                return fmt.Errorf("error creating symlinks: %w", err)
×
910
        }
×
911

912
        return nil
×
913
}
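
A worked summary of the module-loading decision inside the function above; the rows simply enumerate the boolean combinations of its inputs:

// modulesAlreadyLoaded | isHostDriver | devRoot == driverRoot | loadKernelModules
// true                 | any          | any                   | false (modprobe skipped)
// false                | true         | any                   | true  (modprobe via the host root)
// false                | false        | true                  | true  (modprobe via the driver install dir)
// false                | false        | false                 | false (no suitable root to run modprobe from)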
914

915
func createStatusFile(statusFile string) error {
×
916
        _, err := os.Create(statusFile)
×
917
        if err != nil {
×
918
                return fmt.Errorf("unable to create status file %s: %s", statusFile, err)
×
919
        }
×
920
        return nil
×
921
}
922

923
func createStatusFileWithContent(statusFile string, content string) error {
×
924
        dir := filepath.Dir(statusFile)
×
925
        tmpFile, err := os.CreateTemp(dir, filepath.Base(statusFile)+".*.tmp")
×
926
        if err != nil {
×
927
                return fmt.Errorf("failed to create temporary status file: %w", err)
×
928
        }
×
929
        _, err = tmpFile.WriteString(content)
×
930
        tmpFile.Close()
×
931
        if err != nil {
×
932
                return fmt.Errorf("failed to write temporary status file: %w", err)
×
933
        }
×
934
        defer func() {
×
935
                _ = os.Remove(tmpFile.Name())
×
936
        }()
×
937

938
        if err := os.Rename(tmpFile.Name(), statusFile); err != nil {
×
939
                return fmt.Errorf("error moving temporary file to '%s': %w", statusFile, err)
×
940
        }
×
941
        return nil
×
942
}
943

944
func deleteStatusFile(statusFile string) error {
×
945
        err := os.Remove(statusFile)
×
946
        if err != nil {
×
947
                if !os.IsNotExist(err) {
×
948
                        return fmt.Errorf("unable to remove driver status file %s: %w", statusFile, err)
×
949
                }
×
950
                // status file already removed
951
        }
952
        return nil
×
953
}
954

955
func (n *NvidiaFs) validate() error {
×
956
        // delete driver status file if already present
×
957
        err := deleteStatusFile(outputDirFlag + "/" + nvidiaFsStatusFile)
×
958
        if err != nil {
×
959
                return err
×
960
        }
×
961

962
        err = n.runValidation(false)
×
963
        if err != nil {
×
964
                fmt.Println("nvidia-fs driver is not ready")
×
965
                return err
×
966
        }
×
967

968
        // create driver status file
969
        err = createStatusFile(outputDirFlag + "/" + nvidiaFsStatusFile)
×
970
        if err != nil {
×
971
                return err
×
972
        }
×
973
        return nil
×
974
}
975

976
func (n *NvidiaFs) runValidation(silent bool) error {
×
977
        // check for nvidia_fs module to be loaded
×
978
        command := shell
×
979
        args := []string{"-c", "lsmod | grep nvidia_fs"}
×
980

×
981
        if withWaitFlag {
×
982
                return runCommandWithWait(command, args, sleepIntervalSecondsFlag, silent)
×
983
        }
×
984
        return runCommand(command, args, silent)
×
985
}
986

987
func (g *GDRCopy) validate() error {
×
988
        // delete driver status file if already present
×
989
        err := deleteStatusFile(outputDirFlag + "/" + gdrCopyStatusFile)
×
990
        if err != nil {
×
991
                return err
×
992
        }
×
993

994
        err = g.runValidation(false)
×
995
        if err != nil {
×
996
                log.Info("gdrcopy driver is not ready")
×
997
                return err
×
998
        }
×
999

1000
        // create driver status file
1001
        err = createStatusFile(outputDirFlag + "/" + gdrCopyStatusFile)
×
1002
        if err != nil {
×
1003
                return err
×
1004
        }
×
1005
        return nil
×
1006
}
1007

1008
func (g *GDRCopy) runValidation(silent bool) error {
×
1009
        // check for gdrdrv module to be loaded
×
1010
        command := shell
×
1011
        args := []string{"-c", "lsmod | grep -E '^gdrdrv\\s'"}
×
1012

×
1013
        if withWaitFlag {
×
1014
                return runCommandWithWait(command, args, sleepIntervalSecondsFlag, silent)
×
1015
        }
×
1016
        return runCommand(command, args, silent)
×
1017
}
1018

1019
func (t *Toolkit) validate() error {
×
1020
        // delete status file if already present
×
1021
        err := deleteStatusFile(outputDirFlag + "/" + toolkitStatusFile)
×
1022
        if err != nil {
×
1023
                return err
×
1024
        }
×
1025

1026
        // invoke nvidia-smi command to check if container run with toolkit injected files
1027
        command := "nvidia-smi"
×
1028
        args := []string{}
×
1029
        if withWaitFlag {
×
1030
                err = runCommandWithWait(command, args, sleepIntervalSecondsFlag, false)
×
1031
        } else {
×
1032
                err = runCommand(command, args, false)
×
1033
        }
×
1034
        if err != nil {
×
1035
                fmt.Println("toolkit is not ready")
×
1036
                return err
×
1037
        }
×
1038

1039
        // create toolkit status file
1040
        err = createStatusFile(outputDirFlag + "/" + toolkitStatusFile)
×
1041
        if err != nil {
×
1042
                return err
×
1043
        }
×
1044
        return nil
×
1045
}
1046

1047
func (p *Plugin) validate() error {
×
1048
        // delete status file if already present
×
1049
        err := deleteStatusFile(outputDirFlag + "/" + pluginStatusFile)
×
1050
        if err != nil {
×
1051
                return err
×
1052
        }
×
1053

1054
        // enumerate node resources and ensure GPU devices are discovered.
1055
        kubeConfig, err := rest.InClusterConfig()
×
1056
        if err != nil {
×
1057
                log.Errorf("Error getting config cluster - %s\n", err.Error())
×
1058
                return err
×
1059
        }
×
1060

1061
        kubeClient, err := kubernetes.NewForConfig(kubeConfig)
×
1062
        if err != nil {
×
1063
                log.Errorf("Error getting k8s client - %s\n", err.Error())
×
1064
                return err
×
1065
        }
×
1066

1067
        // update k8s client for the plugin
1068
        p.setKubeClient(kubeClient)
×
1069

×
1070
        err = p.validateGPUResource()
×
1071
        if err != nil {
×
1072
                return err
×
1073
        }
×
1074

1075
        if withWorkloadFlag {
×
1076
                // workload test
×
1077
                err = p.runWorkload()
×
1078
                if err != nil {
×
1079
                        return err
×
1080
                }
×
1081
        }
1082

1083
        // create plugin status file
1084
        err = createStatusFile(outputDirFlag + "/" + pluginStatusFile)
×
1085
        if err != nil {
×
1086
                return err
×
1087
        }
×
1088
        return nil
×
1089
}
1090

1091
func (m *MOFED) validate() error {
×
1092
        // If GPUDirectRDMA is disabled, skip validation
×
1093
        if os.Getenv(GPUDirectRDMAEnabledEnvName) != "true" {
×
1094
                log.Info("GPUDirect RDMA is disabled, skipping MOFED driver validation...")
×
1095
                return nil
×
1096
        }
×
1097

1098
        // Check node labels for Mellanox devices and MOFED driver status file
1099
        kubeConfig, err := rest.InClusterConfig()
×
1100
        if err != nil {
×
1101
                log.Errorf("Error getting config cluster - %s\n", err.Error())
×
1102
                return err
×
1103
        }
×
1104

1105
        kubeClient, err := kubernetes.NewForConfig(kubeConfig)
×
1106
        if err != nil {
×
1107
                log.Errorf("Error getting k8s client - %s\n", err.Error())
×
1108
                return err
×
1109
        }
×
1110

1111
        // update k8s client for the mofed driver validation
1112
        m.setKubeClient(kubeClient)
×
1113

×
1114
        present, err := m.isMellanoxDevicePresent()
×
1115
        if err != nil {
×
1116
                log.Errorf("Error trying to retrieve Mellanox device - %s\n", err.Error())
×
1117
                return err
×
1118
        }
×
1119
        if !present {
×
1120
                log.Info("No Mellanox device label found, skipping MOFED driver validation...")
×
1121
                return nil
×
1122
        }
×
1123

1124
        // delete status file if already present
1125
        err = deleteStatusFile(outputDirFlag + "/" + mofedStatusFile)
×
1126
        if err != nil {
×
1127
                return err
×
1128
        }
×
1129

1130
        err = m.runValidation(false)
×
1131
        if err != nil {
×
1132
                return err
×
1133
        }
×
1134

1135
        // create status file to indicate MOFED driver readiness
1136
        err = createStatusFile(outputDirFlag + "/" + mofedStatusFile)
×
1137
        if err != nil {
×
1138
                return err
×
1139
        }
×
1140
        return nil
×
1141
}
1142

1143
func (m *MOFED) runValidation(silent bool) error {
×
1144
        // check for mlx5_core module to be loaded
×
1145
        command := shell
×
1146
        args := []string{"-c", "lsmod | grep mlx5_core"}
×
1147

×
1148
        // If MOFED container is running then use readiness flag set by the driver container instead
×
1149
        if os.Getenv(UseHostMOFEDEnvname) != "true" {
×
1150
                args = []string{"-c", "stat /run/mellanox/drivers/.driver-ready"}
×
1151
        }
×
1152
        if withWaitFlag {
×
1153
                return runCommandWithWait(command, args, sleepIntervalSecondsFlag, silent)
×
1154
        }
×
1155
        return runCommand(command, args, silent)
×
1156
}
1157

1158
func (m *MOFED) setKubeClient(kubeClient kubernetes.Interface) {
×
1159
        m.kubeClient = kubeClient
×
1160
}
×
1161

1162
func (m *MOFED) isMellanoxDevicePresent() (bool, error) {
×
1163
        node, err := getNode(m.ctx, m.kubeClient)
×
1164
        if err != nil {
×
1165
                return false, fmt.Errorf("unable to fetch node by name %s to check for Mellanox device label: %s", nodeNameFlag, err)
×
1166
        }
×
1167
        for key, value := range node.GetLabels() {
×
1168
                if key == MellanoxDeviceLabelKey && value == "true" {
×
1169
                        return true, nil
×
1170
                }
×
1171
        }
1172
        return false, nil
×
1173
}
1174

1175
func (p *Plugin) runWorkload() error {
×
1176
        ctx := p.ctx
×
1177
        // load podSpec
×
1178
        pod, err := loadPodSpec(pluginWorkloadPodSpecPath)
×
1179
        if err != nil {
×
1180
                return err
×
1181
        }
×
1182

1183
        pod.Namespace = namespaceFlag
×
1184
        image := os.Getenv(validatorImageEnvName)
×
1185
        pod.Spec.Containers[0].Image = image
×
1186
        pod.Spec.InitContainers[0].Image = image
×
1187

×
1188
        imagePullPolicy := os.Getenv(validatorImagePullPolicyEnvName)
×
1189
        if imagePullPolicy != "" {
×
1190
                pod.Spec.Containers[0].ImagePullPolicy = corev1.PullPolicy(imagePullPolicy)
×
1191
                pod.Spec.InitContainers[0].ImagePullPolicy = corev1.PullPolicy(imagePullPolicy)
×
1192
        }
×
1193

1194
        if os.Getenv(validatorImagePullSecretsEnvName) != "" {
×
1195
                pullSecrets := strings.Split(os.Getenv(validatorImagePullSecretsEnvName), ",")
×
1196
                for _, secret := range pullSecrets {
×
1197
                        pod.Spec.ImagePullSecrets = append(pod.Spec.ImagePullSecrets, corev1.LocalObjectReference{Name: secret})
×
1198
                }
×
1199
        }
1200
        if os.Getenv(validatorRuntimeClassEnvName) != "" {
×
1201
                runtimeClass := os.Getenv(validatorRuntimeClassEnvName)
×
1202
                pod.Spec.RuntimeClassName = &runtimeClass
×
1203
        }
×
1204

1205
        validatorDaemonset, err := p.kubeClient.AppsV1().DaemonSets(namespaceFlag).Get(ctx, "nvidia-operator-validator", meta_v1.GetOptions{})
×
1206
        if err != nil {
×
1207
                return fmt.Errorf("unable to retrieve the operator validator daemonset: %w", err)
×
1208
        }
×
1209

1210
        // update owner reference
1211
        pod.SetOwnerReferences(validatorDaemonset.OwnerReferences)
×
1212
        // set pod tolerations
×
1213
        pod.Spec.Tolerations = validatorDaemonset.Spec.Template.Spec.Tolerations
×
1214
        // pin the pod to the current node by setting the node name
×
1215
        pod.Spec.NodeName = nodeNameFlag
×
1216

×
1217
        resourceName, err := p.getGPUResourceName()
×
1218
        if err != nil {
×
1219
                return err
×
1220
        }
×
1221

1222
        gpuResource := corev1.ResourceList{
×
1223
                resourceName: resource.MustParse("1"),
×
1224
        }
×
1225

×
1226
        pod.Spec.InitContainers[0].Resources.Limits = gpuResource
×
1227
        pod.Spec.InitContainers[0].Resources.Requests = gpuResource
×
1228
        opts := meta_v1.ListOptions{LabelSelector: labels.Set{"app": pluginValidatorLabelValue}.AsSelector().String(),
×
1229
                FieldSelector: fields.Set{"spec.nodeName": nodeNameFlag}.AsSelector().String()}
×
1230

×
1231
        // check if a plugin validation pod is already running on this node and clean it up
×
1232
        podList, err := p.kubeClient.CoreV1().Pods(namespaceFlag).List(ctx, opts)
×
1233
        if err != nil {
×
1234
                return fmt.Errorf("cannot list existing validation pods: %w", err)
×
1235
        }
×
1236

1237
        if podList != nil && len(podList.Items) > 0 {
×
1238
                propagation := meta_v1.DeletePropagationBackground
×
1239
                gracePeriod := int64(0)
×
1240
                options := meta_v1.DeleteOptions{PropagationPolicy: &propagation, GracePeriodSeconds: &gracePeriod}
×
1241
                err = p.kubeClient.CoreV1().Pods(namespaceFlag).Delete(ctx, podList.Items[0].Name, options)
×
1242
                if err != nil {
×
1243
                        return fmt.Errorf("cannot delete previous validation pod: %w", err)
×
1244
                }
×
1245
        }
1246

1247
        // create the plugin validation pod
1248
        newPod, err := p.kubeClient.CoreV1().Pods(namespaceFlag).Create(ctx, pod, meta_v1.CreateOptions{})
×
1249
        if err != nil {
×
1250
                return fmt.Errorf("failed to create plugin validation pod %s, err %w", pod.Name, err)
×
1251
        }
×
1252

1253
        // wait for the validation pod to complete successfully
1254
        err = waitForPod(ctx, p.kubeClient, newPod.Name, namespaceFlag)
×
1255
        if err != nil {
×
1256
                return err
×
1257
        }
×
1258
        return nil
×
1259
}
1260

1261
// waitForPod polls the given pod until it reaches the Succeeded phase or the retry limit is exhausted
1262
func waitForPod(ctx context.Context, kubeClient kubernetes.Interface, name string, namespace string) error {
×
1263
        for i := 0; i < podCreationWaitRetries; i++ {
×
1264
                // check for the existence of the resource
×
1265
                pod, err := kubeClient.CoreV1().Pods(namespace).Get(ctx, name, meta_v1.GetOptions{})
×
1266
                if err != nil {
×
1267
                        return fmt.Errorf("failed to get pod %s, err %w", name, err)
×
1268
                }
×
1269
                if pod.Status.Phase != "Succeeded" {
×
1270
                        log.Infof("pod %s is currently in %s phase", name, pod.Status.Phase)
×
1271
                        time.Sleep(podCreationSleepIntervalSeconds * time.Second)
×
1272
                        continue
×
1273
                }
1274
                log.Infof("pod %s has run successfully", name)
×
1275
                // pod completed successfully
×
1276
                return nil
×
1277
        }
1278
        return fmt.Errorf("gave up waiting for pod %s to be available", name)
×
1279
}
1280

1281
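// loadPodSpec reads a YAML pod manifest from the given path and decodes it into a corev1.Pod.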
func loadPodSpec(podSpecPath string) (*corev1.Pod, error) {
×
1282
        var pod corev1.Pod
×
1283
        manifest, err := os.ReadFile(podSpecPath)
×
1284
        if err != nil {
×
1285
                return nil, fmt.Errorf("failed to read pod spec from %s: %w", podSpecPath, err)
×
1286
        }
1287
        s := json.NewSerializerWithOptions(json.DefaultMetaFactory, scheme.Scheme,
×
1288
                scheme.Scheme, json.SerializerOptions{Yaml: true, Pretty: false, Strict: false})
×
1289
        reg := regexp.MustCompile(`\b(\w*kind:\w*)\B.*\b`)
×
1290

×
1291
        kind := reg.FindString(string(manifest))
×
1292
        slice := strings.Split(kind, ":")
×
1293
        kind = strings.TrimSpace(slice[1])
×
1294

×
1295
        log.Debugf("Decoding for Kind %s in path: %s", kind, podSpecPath)
×
1296
        _, _, err = s.Decode(manifest, nil, &pod)
×
1297
        if err != nil {
×
1298
                return nil, err
×
1299
        }
×
1300
        return &pod, nil
×
1301
}
1302

1303
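// countGPUResources returns the total count of MIG and full GPU resources reported in the node's capacity.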
func (p *Plugin) countGPUResources() (int64, error) {
×
1304
        // get node info to check discovered GPU resources
×
1305
        node, err := getNode(p.ctx, p.kubeClient)
×
1306
        if err != nil {
×
1307
                return -1, fmt.Errorf("unable to fetch node by name %s to check for GPU resources: %w", nodeNameFlag, err)
×
1308
        }
×
1309

1310
        count := int64(0)
×
1311

×
1312
        for resourceName, quantity := range node.Status.Capacity {
×
1313
                if !strings.HasPrefix(string(resourceName), migGPUResourcePrefix) && !strings.HasPrefix(string(resourceName), genericGPUResourceType) {
×
1314
                        continue
×
1315
                }
1316

1317
                count += quantity.Value()
×
1318
        }
1319
        return count, nil
×
1320
}
1321

1322
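// validateGPUResource polls the node's capacity until a MIG or full GPU resource is discovered,
// returning an error if none appears within the retry limit.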
func (p *Plugin) validateGPUResource() error {
×
1323
        for retry := 1; retry <= gpuResourceDiscoveryWaitRetries; retry++ {
×
1324
                // get node info to check discovered GPU resources
×
1325
                node, err := getNode(p.ctx, p.kubeClient)
×
1326
                if err != nil {
×
1327
                        return fmt.Errorf("unable to fetch node by name %s to check for GPU resources: %s", nodeNameFlag, err)
×
1328
                }
×
1329

1330
                if p.availableMIGResourceName(node.Status.Capacity) != "" {
×
1331
                        return nil
×
1332
                }
×
1333

1334
                if p.availableGenericResourceName(node.Status.Capacity) != "" {
×
1335
                        return nil
×
1336
                }
×
1337

1338
                log.Infof("GPU resources are not yet discovered by the node, retry: %d", retry)
×
1339
                time.Sleep(gpuResourceDiscoveryIntervalSeconds * time.Second)
×
1340
        }
1341
        return fmt.Errorf("GPU resources are not discovered by the node")
×
1342
}
1343

1344
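// availableMIGResourceName returns the first MIG resource name with a non-zero quantity, or "" if none exists.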
func (p *Plugin) availableMIGResourceName(resources corev1.ResourceList) corev1.ResourceName {
×
1345
        for resourceName, quantity := range resources {
×
1346
                if strings.HasPrefix(string(resourceName), migGPUResourcePrefix) && quantity.Value() >= 1 {
×
1347
                        log.Debugf("Found MIG GPU resource name %s quantity %d", resourceName, quantity.Value())
×
1348
                        return resourceName
×
1349
                }
×
1350
        }
1351
        return ""
×
1352
}
1353

1354
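// availableGenericResourceName returns the first generic (non-MIG) GPU resource name with a non-zero quantity, or "" if none exists.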
func (p *Plugin) availableGenericResourceName(resources corev1.ResourceList) corev1.ResourceName {
×
1355
        for resourceName, quantity := range resources {
×
1356
                if strings.HasPrefix(string(resourceName), genericGPUResourceType) && quantity.Value() >= 1 {
×
1357
                        log.Debugf("Found GPU resource name %s quantity %d", resourceName, quantity.Value())
×
1358
                        return resourceName
×
1359
                }
×
1360
        }
1361
        return ""
×
1362
}
1363

1364
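// getGPUResourceName returns an allocatable GPU resource name on the node, preferring MIG resources over full GPUs.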
func (p *Plugin) getGPUResourceName() (corev1.ResourceName, error) {
×
1365
        // get node info to check allocatable GPU resources
×
1366
        node, err := getNode(p.ctx, p.kubeClient)
×
1367
        if err != nil {
×
1368
                return "", fmt.Errorf("unable to fetch node by name %s to check for GPU resources: %s", nodeNameFlag, err)
×
1369
        }
×
1370

1371
        // use mig resource if one is available to run workload
1372
        if resourceName := p.availableMIGResourceName(node.Status.Allocatable); resourceName != "" {
×
1373
                return resourceName, nil
×
1374
        }
×
1375

1376
        if resourceName := p.availableGenericResourceName(node.Status.Allocatable); resourceName != "" {
×
1377
                return resourceName, nil
×
1378
        }
×
1379

1380
        return "", fmt.Errorf("unable to find any allocatable GPU resource")
×
1381
}
1382

1383
func (p *Plugin) setKubeClient(kubeClient kubernetes.Interface) {
×
1384
        p.kubeClient = kubeClient
×
1385
}
×
1386

1387
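// getNode fetches the Node object for the configured node name (nodeNameFlag).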
func getNode(ctx context.Context, kubeClient kubernetes.Interface) (*corev1.Node, error) {
×
1388
        node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeNameFlag, meta_v1.GetOptions{})
×
1389
        if err != nil {
×
1390
                log.Errorf("unable to get node with name %s, err %v", nodeNameFlag, err)
×
1391
                return nil, err
×
1392
        }
×
1393
        return node, nil
×
1394
}
1395

1396
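// validate removes any stale CUDA status file, builds an in-cluster client, optionally runs the
// CUDA validation workload, and writes the status file on success.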
func (c *CUDA) validate() error {
×
1397
        // delete status file if already present
×
1398
        err := deleteStatusFile(outputDirFlag + "/" + cudaStatusFile)
×
1399
        if err != nil {
×
1400
                return err
×
1401
        }
×
1402

1403
        // deploy workload pod for cuda validation
1404
        kubeConfig, err := rest.InClusterConfig()
×
1405
        if err != nil {
×
1406
                log.Errorf("Error getting config cluster - %s\n", err.Error())
×
1407
                return err
×
1408
        }
×
1409

1410
        kubeClient, err := kubernetes.NewForConfig(kubeConfig)
×
1411
        if err != nil {
×
1412
                log.Errorf("Error getting k8s client - %s\n", err.Error())
×
1413
                return err
×
1414
        }
×
1415

1416
        // update k8s client for the CUDA component
1417
        c.setKubeClient(kubeClient)
×
1418

×
1419
        if withWorkloadFlag {
×
1420
                // workload test
×
1421
                err = c.runWorkload()
×
1422
                if err != nil {
×
1423
                        return err
×
1424
                }
×
1425
        }
1426

1427
        // create cuda status file
1428
        err = createStatusFile(outputDirFlag + "/" + cudaStatusFile)
×
1429
        if err != nil {
×
1430
                return err
×
1431
        }
×
1432
        return nil
×
1433
}
1434

1435
func (c *CUDA) setKubeClient(kubeClient kubernetes.Interface) {
×
1436
        c.kubeClient = kubeClient
×
1437
}
×
1438

1439
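// runWorkload launches the CUDA validation workload pod on the current node and waits for it to complete successfully.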
func (c *CUDA) runWorkload() error {
×
1440
        ctx := c.ctx
×
1441

×
1442
        // load podSpec
×
1443
        pod, err := loadPodSpec(cudaWorkloadPodSpecPath)
×
1444
        if err != nil {
×
1445
                return err
×
1446
        }
×
1447
        pod.Namespace = namespaceFlag
×
1448
        image := os.Getenv(validatorImageEnvName)
×
1449
        pod.Spec.Containers[0].Image = image
×
1450
        pod.Spec.InitContainers[0].Image = image
×
1451

×
1452
        imagePullPolicy := os.Getenv(validatorImagePullPolicyEnvName)
×
1453
        if imagePullPolicy != "" {
×
1454
                pod.Spec.Containers[0].ImagePullPolicy = corev1.PullPolicy(imagePullPolicy)
×
1455
                pod.Spec.InitContainers[0].ImagePullPolicy = corev1.PullPolicy(imagePullPolicy)
×
1456
        }
×
1457

1458
        if os.Getenv(validatorImagePullSecretsEnvName) != "" {
×
1459
                pullSecrets := strings.Split(os.Getenv(validatorImagePullSecretsEnvName), ",")
×
1460
                for _, secret := range pullSecrets {
×
1461
                        pod.Spec.ImagePullSecrets = append(pod.Spec.ImagePullSecrets, corev1.LocalObjectReference{Name: secret})
×
1462
                }
×
1463
        }
1464
        if os.Getenv(validatorRuntimeClassEnvName) != "" {
×
1465
                runtimeClass := os.Getenv(validatorRuntimeClassEnvName)
×
1466
                pod.Spec.RuntimeClassName = &runtimeClass
×
1467
        }
×
1468

1469
        validatorDaemonset, err := c.kubeClient.AppsV1().DaemonSets(namespaceFlag).Get(ctx, "nvidia-operator-validator", meta_v1.GetOptions{})
×
1470
        if err != nil {
×
1471
                return fmt.Errorf("unable to retrieve the operator validator daemonset: %w", err)
×
1472
        }
×
1473

1474
        // update owner reference
1475
        pod.SetOwnerReferences(validatorDaemonset.OwnerReferences)
×
1476
        // set pod tolerations
×
1477
        pod.Spec.Tolerations = validatorDaemonset.Spec.Template.Spec.Tolerations
×
1478
        // pin the pod to the current node by setting the node name
×
1479
        pod.Spec.NodeName = nodeNameFlag
×
1480

×
1481
        opts := meta_v1.ListOptions{LabelSelector: labels.Set{"app": cudaValidatorLabelValue}.AsSelector().String(),
×
1482
                FieldSelector: fields.Set{"spec.nodeName": nodeNameFlag}.AsSelector().String()}
×
1483

×
1484
        // check if a cuda workload pod is already running on this node and clean it up
×
1485
        podList, err := c.kubeClient.CoreV1().Pods(namespaceFlag).List(ctx, opts)
×
1486
        if err != nil {
×
1487
                return fmt.Errorf("cannot list existing validation pods: %w", err)
×
1488
        }
×
1489

1490
        if podList != nil && len(podList.Items) > 0 {
×
1491
                propagation := meta_v1.DeletePropagationBackground
×
1492
                gracePeriod := int64(0)
×
1493
                options := meta_v1.DeleteOptions{PropagationPolicy: &propagation, GracePeriodSeconds: &gracePeriod}
×
1494
                err = c.kubeClient.CoreV1().Pods(namespaceFlag).Delete(ctx, podList.Items[0].Name, options)
×
1495
                if err != nil {
×
1496
                        return fmt.Errorf("cannot delete previous validation pod: %w", err)
×
1497
                }
×
1498
        }
1499

1500
        // create the cuda validation workload pod
1501
        newPod, err := c.kubeClient.CoreV1().Pods(namespaceFlag).Create(ctx, pod, meta_v1.CreateOptions{})
×
1502
        if err != nil {
×
1503
                return fmt.Errorf("failed to create cuda validation pod %s, err %w", pod.Name, err)
×
1504
        }
×
1505

1506
        // wait for the workload pod to complete successfully
1507
        err = waitForPod(ctx, c.kubeClient, newPod.Name, namespaceFlag)
×
1508
        if err != nil {
×
1509
                return err
×
1510
        }
×
1511
        return nil
×
1512
}
1513

1514
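// run starts the node metrics exporter on the configured metrics port.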
func (c *Metrics) run() error {
×
1515
        m := NewNodeMetrics(c.ctx, metricsPort)
×
1516

×
1517
        return m.Run()
×
1518
}
×
1519

1520
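// validate records the GPU workload configuration and, when the node is configured for VM passthrough,
// verifies that all NVIDIA GPUs are bound to a VFIO driver before writing the status file.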
func (v *VfioPCI) validate() error {
×
1521
        ctx := v.ctx
×
1522

×
1523
        gpuWorkloadConfig, err := getWorkloadConfig(ctx)
×
1524
        if err != nil {
×
1525
                return fmt.Errorf("error getting gpu workload config: %w", err)
×
1526
        }
×
1527
        log.Infof("GPU workload configuration: %s", gpuWorkloadConfig)
×
1528

×
1529
        err = createStatusFileWithContent(filepath.Join(outputDirFlag, workloadTypeStatusFile), gpuWorkloadConfig+"\n")
×
1530
        if err != nil {
×
1531
                return fmt.Errorf("error updating %s status file: %w", workloadTypeStatusFile, err)
×
1532
        }
×
1533

1534
        if gpuWorkloadConfig != gpuWorkloadConfigVMPassthrough {
×
1535
                log.WithFields(log.Fields{
×
1536
                        "gpuWorkloadConfig": gpuWorkloadConfig,
×
1537
                }).Info("vfio-pci not required on the node. Skipping validation.")
×
1538
                return nil
×
1539
        }
×
1540

1541
        // delete status file if already present
1542
        err = deleteStatusFile(outputDirFlag + "/" + vfioPCIStatusFile)
×
1543
        if err != nil {
×
1544
                return err
×
1545
        }
×
1546

1547
        err = v.runValidation()
×
1548
        if err != nil {
×
1549
                return err
×
1550
        }
×
1551
        log.Info("Validation completed successfully - all devices are bound to vfio-pci")
×
1552

×
1553
        // create status file to indicate successful validation
×
1554
        err = createStatusFile(outputDirFlag + "/" + vfioPCIStatusFile)
×
1555
        if err != nil {
×
1556
                return err
×
1557
        }
×
1558
        return nil
×
1559
}
1560

1561
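// runValidation returns an error if any NVIDIA PCI device is not bound to a supported VFIO driver.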
func (v *VfioPCI) runValidation() error {
×
1562
        nvpci := nvpci.New()
×
1563
        nvdevices, err := nvpci.GetGPUs()
×
1564
        if err != nil {
×
1565
                return fmt.Errorf("error getting NVIDIA PCI devices: %w", err)
×
1566
        }
×
1567

1568
        for _, dev := range nvdevices {
×
1569
                // TODO: Do not hardcode a list of VFIO driver names. This would be possible if we
×
1570
                // added an API to go-nvlib which returns the most suitable VFIO driver for a GPU,
×
1571
                // using logic similar to https://github.com/NVIDIA/k8s-driver-manager/commit/874c8cd26d775db437f16a42c3e44e74301b3a35
×
1572
                if dev.Driver != "vfio-pci" && dev.Driver != "nvgrace_gpu_vfio_pci" {
×
1573
                        return fmt.Errorf("device not bound to 'vfio-pci'; device: %s driver: '%s'", dev.Address, dev.Driver)
×
1574
                }
×
1575
        }
1576

1577
        return nil
×
1578
}
1579

1580
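// validate records the GPU workload configuration and, when the node is configured for vGPU VMs,
// validates the vGPU Manager driver, waits for SR-IOV VFs, and writes the appropriate status file.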
func (v *VGPUManager) validate() error {
×
1581
        ctx := v.ctx
×
1582

×
1583
        gpuWorkloadConfig, err := getWorkloadConfig(ctx)
×
1584
        if err != nil {
×
1585
                return fmt.Errorf("error getting gpu workload config: %w", err)
×
1586
        }
×
1587
        log.Infof("GPU workload configuration: %s", gpuWorkloadConfig)
×
1588

×
1589
        err = createStatusFileWithContent(filepath.Join(outputDirFlag, workloadTypeStatusFile), gpuWorkloadConfig+"\n")
×
1590
        if err != nil {
×
1591
                return fmt.Errorf("error updating %s status file: %w", workloadTypeStatusFile, err)
×
1592
        }
×
1593

1594
        if gpuWorkloadConfig != gpuWorkloadConfigVMVgpu {
×
1595
                log.WithFields(log.Fields{
×
1596
                        "gpuWorkloadConfig": gpuWorkloadConfig,
×
1597
                }).Info("vGPU Manager not required on the node. Skipping validation.")
×
1598
                return nil
×
1599
        }
×
1600

1601
        // delete status file if already present
1602
        err = deleteStatusFile(outputDirFlag + "/" + vGPUManagerStatusFile)
×
1603
        if err != nil {
×
1604
                return err
×
1605
        }
×
1606

1607
        // delete status file if already present
1608
        err = deleteStatusFile(outputDirFlag + "/" + hostVGPUManagerStatusFile)
×
1609
        if err != nil {
×
1610
                return err
×
1611
        }
×
1612

1613
        hostDriver, err := v.runValidation(false)
×
1614
        if err != nil {
×
1615
                fmt.Println("vGPU Manager is not ready")
×
1616
                return err
×
1617
        }
×
1618

1619
        log.Info("Waiting for VFs to be available...")
×
1620
        if err := waitForVFs(ctx, defaultVFWaitTimeout); err != nil {
×
1621
                return fmt.Errorf("vGPU Manager VFs not ready: %w", err)
×
1622
        }
×
1623

1624
        statusFile := vGPUManagerStatusFile
×
1625
        if hostDriver {
×
1626
                statusFile = hostVGPUManagerStatusFile
×
1627
        }
×
1628

1629
        // create driver status file
1630
        err = createStatusFile(outputDirFlag + "/" + statusFile)
×
1631
        if err != nil {
×
1632
                return err
×
1633
        }
×
1634
        return nil
×
1635
}
1636

1637
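// runValidation runs nvidia-smi chrooted into the driver container root, or into the host root when
// the driver is pre-installed, and reports whether the host driver was used.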
func (v *VGPUManager) runValidation(silent bool) (hostDriver bool, err error) {
×
1638
        // invoke validation command
×
1639
        command := "chroot"
×
1640
        args := []string{"/run/nvidia/driver", "nvidia-smi"}
×
1641

×
1642
        // check if driver is pre-installed on the host and use host path for validation
×
1643
        if _, err := os.Lstat("/host/usr/bin/nvidia-smi"); err == nil {
×
1644
                args = []string{"/host", "nvidia-smi"}
×
1645
                hostDriver = true
×
1646
        }
×
1647

1648
        if withWaitFlag {
×
1649
                return hostDriver, runCommandWithWait(command, args, sleepIntervalSecondsFlag, silent)
×
1650
        }
×
1651

1652
        return hostDriver, runCommand(command, args, silent)
×
1653
}
1654

1655
// waitForVFs waits for Virtual Functions to be created on all NVIDIA GPUs.
1656
// It polls sriov_numvfs until all GPUs have their full VF count enabled.
1657
func waitForVFs(ctx context.Context, timeout time.Duration) error {
×
1658
        pollInterval := time.Duration(sleepIntervalSecondsFlag) * time.Second
×
1659
        nvpciLib := nvpci.New()
×
1660

×
1661
        return wait.PollUntilContextTimeout(ctx, pollInterval, timeout, true, func(ctx context.Context) (bool, error) {
×
1662
                gpus, err := nvpciLib.GetGPUs()
×
1663
                if err != nil {
×
1664
                        log.Warnf("Error getting GPUs: %v", err)
×
1665
                        return false, nil
×
1666
                }
×
1667

1668
                var totalExpected, totalEnabled uint64
×
1669
                var pfCount int
×
1670
                for _, gpu := range gpus {
×
1671
                        sriovInfo := gpu.SriovInfo
×
1672
                        if sriovInfo.IsPF() {
×
1673
                                pfCount++
×
1674
                                totalExpected += sriovInfo.PhysicalFunction.TotalVFs
×
1675
                                totalEnabled += sriovInfo.PhysicalFunction.NumVFs
×
1676
                        }
×
1677
                }
1678

1679
                if totalExpected == 0 {
×
1680
                        log.Info("No SR-IOV capable GPUs found, skipping VF wait")
×
1681
                        return true, nil
×
1682
                }
×
1683

1684
                if totalEnabled == totalExpected {
×
1685
                        log.Infof("All %d VF(s) enabled on %d NVIDIA GPU(s)", totalEnabled, pfCount)
×
1686
                        return true, nil
×
1687
                }
×
1688

1689
                log.Infof("Waiting for VFs: %d/%d enabled across %d GPU(s)", totalEnabled, totalExpected, pfCount)
×
1690
                return false, nil
×
1691
        })
1692
}
1693

1694
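// validate removes any stale CC Manager status file, builds an in-cluster client, runs the
// validation, and writes the status file on success.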
func (c *CCManager) validate() error {
×
1695
        // delete status file if already present
×
1696
        err := deleteStatusFile(outputDirFlag + "/" + ccManagerStatusFile)
×
1697
        if err != nil {
×
1698
                return err
×
1699
        }
×
1700

1701
        kubeConfig, err := rest.InClusterConfig()
×
1702
        if err != nil {
×
1703
                return fmt.Errorf("error getting cluster config - %w", err)
×
1704
        }
×
1705

1706
        kubeClient, err := kubernetes.NewForConfig(kubeConfig)
×
1707
        if err != nil {
×
1708
                log.Errorf("Error getting k8s client - %v\n", err)
×
1709
                return err
×
1710
        }
×
1711

1712
        // update k8s client for fetching node labels
1713
        c.setKubeClient(kubeClient)
×
1714

×
1715
        err = c.runValidation(false)
×
1716
        if err != nil {
×
1717
                fmt.Println("CC Manager is not ready")
×
1718
                return err
×
1719
        }
×
1720

1721
        // create driver status file
1722
        err = createStatusFile(outputDirFlag + "/" + ccManagerStatusFile)
×
1723
        if err != nil {
×
1724
                return err
×
1725
        }
×
1726
        return nil
×
1727
}
1728

1729
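// runValidation skips nodes that are not CC capable (based on the node label) and otherwise
// checks that the CC Manager container has signalled readiness.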
func (c *CCManager) runValidation(silent bool) error {
×
1730
        node, err := getNode(c.ctx, c.kubeClient)
×
1731
        if err != nil {
×
1732
                return fmt.Errorf("unable to fetch node by name %s to check for %s label: %w",
×
1733
                        nodeNameFlag, CCCapableLabelKey, err)
×
1734
        }
×
1735

1736
        // make sure this is a CC capable node
1737
        nodeLabels := node.GetLabels()
×
1738
        if enabled, ok := nodeLabels[CCCapableLabelKey]; !ok || enabled != "true" {
×
1739
                log.Info("Not a CC capable node, skipping CC Manager validation")
×
1740
                return nil
×
1741
        }
×
1742

1743
        // check if the ccManager container is ready
1744
        err = assertCCManagerContainerReady(silent, withWaitFlag)
×
1745
        if err != nil {
×
1746
                return err
×
1747
        }
×
1748
        return nil
×
1749
}
1750

1751
func (c *CCManager) setKubeClient(kubeClient kubernetes.Interface) {
×
1752
        c.kubeClient = kubeClient
×
1753
}
×
1754

1755
// Check that the ccManager container is ready after applying required ccMode
1756
func assertCCManagerContainerReady(silent, withWaitFlag bool) error {
×
1757
        command := shell
×
1758
        args := []string{"-c", "stat /run/nvidia/validations/.cc-manager-ctr-ready"}
×
1759

×
1760
        if withWaitFlag {
×
1761
                return runCommandWithWait(command, args, sleepIntervalSecondsFlag, silent)
×
1762
        }
×
1763

1764
        return runCommand(command, args, silent)
×
1765
}
1766

1767
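// validate records the GPU workload configuration and, when the node is configured for vGPU VMs,
// verifies that vGPU devices are present on the host before writing the status file.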
func (v *VGPUDevices) validate() error {
×
1768
        ctx := v.ctx
×
1769

×
1770
        gpuWorkloadConfig, err := getWorkloadConfig(ctx)
×
1771
        if err != nil {
×
1772
                return fmt.Errorf("error getting gpu workload config: %w", err)
×
1773
        }
×
1774
        log.Infof("GPU workload configuration: %s", gpuWorkloadConfig)
×
1775

×
1776
        err = createStatusFileWithContent(filepath.Join(outputDirFlag, workloadTypeStatusFile), gpuWorkloadConfig+"\n")
×
1777
        if err != nil {
×
1778
                return fmt.Errorf("error updating %s status file: %w", workloadTypeStatusFile, err)
×
1779
        }
×
1780

1781
        if gpuWorkloadConfig != gpuWorkloadConfigVMVgpu {
×
1782
                log.WithFields(log.Fields{
×
1783
                        "gpuWorkloadConfig": gpuWorkloadConfig,
×
1784
                }).Info("vGPU devices not required on the node. Skipping validation.")
×
1785
                return nil
×
1786
        }
×
1787

1788
        // delete status file if already present
1789
        err = deleteStatusFile(outputDirFlag + "/" + vGPUDevicesStatusFile)
×
1790
        if err != nil {
×
1791
                return err
×
1792
        }
×
1793

1794
        err = v.runValidation()
×
1795
        if err != nil {
×
1796
                return err
×
1797
        }
×
1798
        log.Info("Validation completed successfully - vGPU devices present on the host")
×
1799

×
1800
        // create status file
×
1801
        err = createStatusFile(outputDirFlag + "/" + vGPUDevicesStatusFile)
×
1802
        if err != nil {
×
1803
                return err
×
1804
        }
×
1805

1806
        return nil
×
1807
}
1808

1809
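// runValidation checks for vGPU (mdev) devices on the host, optionally polling until at least one is found.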
func (v *VGPUDevices) runValidation() error {
×
1810
        nvmdev := nvmdev.New()
×
1811
        vGPUDevices, err := nvmdev.GetAllDevices()
×
1812
        if err != nil {
×
1813
                return fmt.Errorf("error checking for vGPU devices on the host: %w", err)
×
1814
        }
×
1815

1816
        if !withWaitFlag {
×
1817
                numDevices := len(vGPUDevices)
×
1818
                if numDevices == 0 {
×
1819
                        return fmt.Errorf("no vGPU devices found")
×
1820
                }
×
1821

1822
                log.Infof("Found %d vGPU devices", numDevices)
×
1823
                return nil
×
1824
        }
1825

1826
        for {
×
1827
                numDevices := len(vGPUDevices)
×
1828
                if numDevices > 0 {
×
1829
                        log.Infof("Found %d vGPU devices", numDevices)
×
1830
                        return nil
×
1831
                }
×
1832
                log.Infof("No vGPU devices found, retrying after %d seconds", sleepIntervalSecondsFlag)
×
1833
                time.Sleep(time.Duration(sleepIntervalSecondsFlag) * time.Second)
×
1834

×
1835
                vGPUDevices, err = nvmdev.GetAllDevices()
×
1836
                if err != nil {
×
1837
                        return fmt.Errorf("error checking for vGPU devices on the host: %w", err)
×
1838
                }
×
1839
        }
1840
}