• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

NVIDIA / gpu-operator / 18929278744

30 Oct 2025 03:50AM UTC coverage: 22.389% (+0.04%) from 22.345%
18929278744

push

github

rahulait
fix logging for controllers

Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>

0 of 67 new or added lines in 3 files covered. (0.0%)

3 existing lines in 2 files now uncovered.

2622 of 11711 relevant lines covered (22.39%)

0.25 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/controllers/nvidiadriver_controller.go
1
/*
2
Copyright 2021.
3

4
Licensed under the Apache License, Version 2.0 (the "License");
5
you may not use this file except in compliance with the License.
6
You may obtain a copy of the License at
7

8
    http://www.apache.org/licenses/LICENSE-2.0
9

10
Unless required by applicable law or agreed to in writing, software
11
distributed under the License is distributed on an "AS IS" BASIS,
12
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
See the License for the specific language governing permissions and
14
limitations under the License.
15
*/
16

17
package controllers
18

19
import (
20
        "context"
21
        "errors"
22
        "fmt"
23
        "maps"
24
        "time"
25

26
        appsv1 "k8s.io/api/apps/v1"
27
        corev1 "k8s.io/api/core/v1"
28
        apierrors "k8s.io/apimachinery/pkg/api/errors"
29
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30
        "k8s.io/apimachinery/pkg/runtime"
31
        "k8s.io/apimachinery/pkg/types"
32
        "k8s.io/client-go/util/workqueue"
33
        ctrl "sigs.k8s.io/controller-runtime"
34
        "sigs.k8s.io/controller-runtime/pkg/client"
35
        "sigs.k8s.io/controller-runtime/pkg/controller"
36
        "sigs.k8s.io/controller-runtime/pkg/event"
37
        "sigs.k8s.io/controller-runtime/pkg/handler"
38
        "sigs.k8s.io/controller-runtime/pkg/log"
39
        "sigs.k8s.io/controller-runtime/pkg/predicate"
40
        "sigs.k8s.io/controller-runtime/pkg/reconcile"
41
        "sigs.k8s.io/controller-runtime/pkg/source"
42

43
        gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
44
        nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
45
        "github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
46
        "github.com/NVIDIA/gpu-operator/internal/conditions"
47
        "github.com/NVIDIA/gpu-operator/internal/consts"
48
        "github.com/NVIDIA/gpu-operator/internal/state"
49
        "github.com/NVIDIA/gpu-operator/internal/validator"
50
)
51

52
// NVIDIADriverReconciler reconciles a NVIDIADriver object.
// The exported fields are expected to be populated by the caller before
// SetupWithManager is invoked; the unexported fields are initialized inside
// SetupWithManager itself.
type NVIDIADriverReconciler struct {
	client.Client
	// Scheme is the runtime scheme used by the manager/client machinery.
	Scheme *runtime.Scheme
	// ClusterInfo carries cluster facts collected before this controller was
	// started; it is handed to state managers via the InfoCatalog in Reconcile.
	ClusterInfo clusterinfo.Interface
	// Namespace is the operator namespace, used e.g. to resolve the
	// spec.secretEnv Secret reference during Reconcile.
	Namespace string

	// stateManager drives syncing of the driver resources (set in SetupWithManager).
	stateManager state.Manager
	// nodeSelectorValidator rejects NVIDIADriver instances whose nodeSelector
	// conflicts with another instance (set in SetupWithManager).
	nodeSelectorValidator validator.Validator
	// conditionUpdater writes status conditions on the CR (set in SetupWithManager).
	conditionUpdater conditions.Updater
}
63

64
//+kubebuilder:rbac:groups=nvidia.com,resources=nvidiadrivers,verbs=get;list;watch;create;update;patch;delete
65
//+kubebuilder:rbac:groups=nvidia.com,resources=nvidiadrivers/status,verbs=get;update;patch
66
//+kubebuilder:rbac:groups=nvidia.com,resources=nvidiadrivers/finalizers,verbs=update
67

68
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
//
// It fetches the NVIDIADriver instance for the request, validates it against
// the singleton ClusterPolicy and against sibling instances, then delegates to
// the state manager to sync owned resources and updates the CR status and
// conditions accordingly.
//
// Requeue behavior, by path:
//   - NotFound on the instance: drop (no requeue).
//   - API errors reading the instance or ClusterPolicy list: return the error
//     (controller-runtime requeues with backoff).
//   - Validation failures (conflicting nodeSelector, unsupported driver
//     combinations, missing secret): a condition is set and nil is returned —
//     requeueing would not help until the spec is edited.
//   - State not yet ready: fixed 5s requeue while waiting on driver pods.
//
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.8.3/pkg/reconcile
func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)
	logger.V(consts.LogLevelInfo).Info("Reconciling NVIDIADriver")

	// Get the NvidiaDriver instance from this request
	instance := &nvidiav1alpha1.NVIDIADriver{}
	if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
		if apierrors.IsNotFound(err) {
			// Request object not found, could have been deleted after reconcile request.
			// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
			// Return and don't requeue
			return reconcile.Result{}, nil
		}
		wrappedErr := fmt.Errorf("error getting NVIDIADriver object: %w", err)
		logger.Error(err, "error getting NVIDIADriver object")
		// NOTE(review): instance was not fetched here, so the condition update
		// runs against an empty object — presumably best-effort; confirm.
		instance.Status.State = nvidiav1alpha1.NotReady
		if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, wrappedErr.Error()); condErr != nil {
			logger.Error(condErr, "failed to set condition")
		}
		// Error reading the object - requeue the request.
		return reconcile.Result{}, wrappedErr
	}

	// Get the singleton NVIDIA ClusterPolicy object in the cluster.
	clusterPolicyList := &gpuv1.ClusterPolicyList{}
	if err := r.List(ctx, clusterPolicyList); err != nil {
		err = fmt.Errorf("error getting ClusterPolicy list: %w", err)
		logger.Error(err, "error getting ClusterPolicy list")
		instance.Status.State = nvidiav1alpha1.NotReady
		if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
			logger.Error(condErr, "failed to set condition")
		}
		return reconcile.Result{}, err
	}

	if len(clusterPolicyList.Items) == 0 {
		err := fmt.Errorf("no ClusterPolicy object found in the cluster")
		logger.Error(err, "no ClusterPolicy object found in the cluster")
		instance.Status.State = nvidiav1alpha1.NotReady
		if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
			logger.Error(condErr, "failed to set condition")
		}
		return reconcile.Result{}, err
	}
	// Only the first ClusterPolicy is used; it is expected to be a singleton.
	clusterPolicyInstance := clusterPolicyList.Items[0]

	// Create a new InfoCatalog which is a generic interface for passing information to state managers
	infoCatalog := state.NewInfoCatalog()

	// Add an entry for ClusterInfo, which was collected before the NVIDIADriver controller was started
	infoCatalog.Add(state.InfoTypeClusterInfo, r.ClusterInfo)

	// Add an entry for Clusterpolicy, which is needed to deploy the driver daemonset
	infoCatalog.Add(state.InfoTypeClusterPolicyCR, clusterPolicyInstance)

	// Verify the nodeSelector configured for this NVIDIADriver instance does
	// not conflict with any other instances. This ensures only one driver
	// is deployed per GPU node.
	if err := r.nodeSelectorValidator.Validate(ctx, instance); err != nil {
		logger.Error(err, "nodeSelector validation failed")
		if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ConflictingNodeSelector, err.Error()); condErr != nil {
			logger.Error(condErr, "failed to set condition")
		}
		// Spec-level conflict: do not requeue; a spec edit will trigger a new event.
		return reconcile.Result{}, nil
	}

	// Reject spec combinations that the driver stack does not support.
	if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
		err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
		logger.Error(err, "unsupported driver combination detected")
		instance.Status.State = nvidiav1alpha1.NotReady
		if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
			logger.Error(condErr, "failed to set condition")
		}
		return reconcile.Result{}, nil
	}

	if instance.Spec.IsGDSEnabled() && instance.Spec.IsOpenKernelModulesRequired() && !instance.Spec.IsOpenKernelModulesEnabled() {
		err := fmt.Errorf("GPUDirect Storage driver '%s' is only supported with NVIDIA OpenRM drivers. Please set 'useOpenKernelModules=true' to enable OpenRM mode", instance.Spec.GPUDirectStorage.Version)
		logger.Error(err, "unsupported driver combination detected")
		instance.Status.State = nvidiav1alpha1.NotReady
		if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
			logger.Error(condErr, "failed to set condition")
		}
		return reconcile.Result{}, nil
	}

	// ensure that the specified K8s secret actually exists in the operator namespace
	secretName := instance.Spec.SecretEnv
	if len(secretName) > 0 {
		key := client.ObjectKey{Namespace: r.Namespace, Name: secretName}
		if err := r.Get(ctx, key, &corev1.Secret{}); err != nil {
			logger.Error(err, "failed to get secret")
			if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
				logger.Error(condErr, "failed to set condition")
			}
			return reconcile.Result{}, nil
		}
	}

	// Sync state and update status
	managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)

	// update CR status
	if err := r.updateCrStatus(ctx, instance, managerStatus); err != nil {
		return ctrl.Result{}, err
	}

	if managerStatus.Status != state.SyncStateReady {
		logger.Info("NVIDIADriver instance is not ready")
		var errorInfo error
		// Surface the first state-level error (if any) as a ReconcileFailed condition.
		for _, result := range managerStatus.StatesStatus {
			if result.Status != state.SyncStateReady && result.ErrInfo != nil {
				errorInfo = result.ErrInfo
				if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Error syncing state %s: %v", result.StateName, errorInfo.Error())); condErr != nil {
					logger.Error(condErr, "failed to set condition")
				}
				break
			}
		}
		// if no errors are reported from any state, then we would be waiting on driver daemonset pods
		if errorInfo == nil {
			if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.DriverNotReady, "Waiting for driver pod to be ready"); condErr != nil {
				logger.Error(condErr, "failed to set condition")
			}
		}
		// Poll again shortly while waiting for the driver to become ready.
		return reconcile.Result{RequeueAfter: time.Second * 5}, nil
	}

	// Everything synced successfully: mark the CR Ready.
	if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, "All resources have been successfully reconciled"); condErr != nil {
		logger.Error(condErr, "failed to set condition")
		return ctrl.Result{}, condErr
	}
	return reconcile.Result{}, nil
}
211

212
func (r *NVIDIADriverReconciler) updateCrStatus(
213
        ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, status state.Results) error {
×
214
        reqLogger := log.FromContext(ctx)
×
215

×
216
        // Fetch latest instance and update state to avoid version mismatch
×
217
        instance := &nvidiav1alpha1.NVIDIADriver{}
×
218
        err := r.Get(ctx, types.NamespacedName{Name: cr.Name}, instance)
×
219
        if err != nil {
×
220
                reqLogger.Error(err, "Failed to get NVIDIADriver instance for status update")
×
221
                return err
×
222
        }
×
223

224
        // Update global State
225
        if instance.Status.State == nvidiav1alpha1.State(status.Status) {
×
226
                return nil
×
227
        }
×
228
        instance.Status.State = nvidiav1alpha1.State(status.Status)
×
229

×
230
        // send status update request to k8s API
×
231
        reqLogger.V(consts.LogLevelInfo).Info("Updating CR Status", "Status", instance.Status)
×
232
        err = r.Status().Update(ctx, instance)
×
233
        if err != nil {
×
NEW
234
                reqLogger.Error(err, "Failed to update CR status")
×
235
                return err
×
236
        }
×
237
        return nil
×
238
}
239

240
// SetupWithManager sets up the controller with the Manager.
241
func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
×
242
        // Create state manager
×
243
        stateManager, err := state.NewManager(
×
244
                nvidiav1alpha1.NVIDIADriverCRDName,
×
245
                r.Namespace,
×
246
                mgr.GetClient(),
×
247
                mgr.GetScheme())
×
248
        if err != nil {
×
249
                return fmt.Errorf("error creating state manager: %v", err)
×
250
        }
×
251
        r.stateManager = stateManager
×
252

×
253
        // initialize validators
×
254
        r.nodeSelectorValidator = validator.NewNodeSelectorValidator(r.Client)
×
255

×
256
        // initialize condition updater
×
257
        r.conditionUpdater = conditions.NewNvDriverUpdater(mgr.GetClient())
×
258

×
259
        // Create a new NVIDIADriver controller
×
260
        c, err := controller.New("nvidia-driver-controller", mgr, controller.Options{
×
261
                Reconciler:              r,
×
262
                MaxConcurrentReconciles: 1,
×
263
                RateLimiter:             workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](minDelayCR, maxDelayCR),
×
264
        })
×
265
        if err != nil {
×
266
                return err
×
267
        }
×
268

269
        // Watch for changes to the primary resource NVIDIaDriver
270
        err = c.Watch(source.Kind(
×
271
                mgr.GetCache(),
×
272
                &nvidiav1alpha1.NVIDIADriver{},
×
273
                &handler.TypedEnqueueRequestForObject[*nvidiav1alpha1.NVIDIADriver]{},
×
274
                predicate.TypedGenerationChangedPredicate[*nvidiav1alpha1.NVIDIADriver]{},
×
275
        ),
×
276
        )
×
277
        if err != nil {
×
278
                return err
×
279
        }
×
280

281
        // Watch for changes to ClusterPolicy. Whenever an event is generated for ClusterPolicy, enqueue
282
        // a reconcile request for all NVIDIADriver instances.
283
        mapFn := func(ctx context.Context, cp *gpuv1.ClusterPolicy) []reconcile.Request {
×
284
                logger := log.FromContext(ctx)
×
285
                opts := []client.ListOption{}
×
286
                list := &nvidiav1alpha1.NVIDIADriverList{}
×
287

×
288
                err := mgr.GetClient().List(ctx, list, opts...)
×
289
                if err != nil {
×
290
                        logger.Error(err, "Unable to list NVIDIADriver resources")
×
291
                        return []reconcile.Request{}
×
292
                }
×
293

294
                reconcileRequests := []reconcile.Request{}
×
295
                for _, nvidiaDriver := range list.Items {
×
296
                        reconcileRequests = append(reconcileRequests,
×
297
                                reconcile.Request{
×
298
                                        NamespacedName: types.NamespacedName{
×
299
                                                Name:      nvidiaDriver.GetName(),
×
300
                                                Namespace: nvidiaDriver.GetNamespace(),
×
301
                                        },
×
302
                                })
×
303
                }
×
304

305
                return reconcileRequests
×
306
        }
307

308
        // Watch for changes to the Nodes. Whenever an event is generated for ClusterPolicy, enqueue
309
        // a reconcile request for all NVIDIADriver instances.
310
        nodeMapFn := func(ctx context.Context, cp *corev1.Node) []reconcile.Request {
×
311
                logger := log.FromContext(ctx)
×
312
                opts := []client.ListOption{}
×
313
                list := &nvidiav1alpha1.NVIDIADriverList{}
×
314

×
315
                err := mgr.GetClient().List(ctx, list, opts...)
×
316
                if err != nil {
×
317
                        logger.Error(err, "Unable to list NVIDIADriver resources")
×
318
                        return []reconcile.Request{}
×
319
                }
×
320

321
                reconcileRequests := []reconcile.Request{}
×
322
                for _, nvidiaDriver := range list.Items {
×
323
                        reconcileRequests = append(reconcileRequests,
×
324
                                reconcile.Request{
×
325
                                        NamespacedName: types.NamespacedName{
×
326
                                                Name:      nvidiaDriver.GetName(),
×
327
                                                Namespace: nvidiaDriver.GetNamespace(),
×
328
                                        },
×
329
                                })
×
330
                }
×
331

332
                return reconcileRequests
×
333
        }
334

335
        err = c.Watch(
×
336
                source.Kind(
×
337
                        mgr.GetCache(),
×
338
                        &gpuv1.ClusterPolicy{},
×
339
                        handler.TypedEnqueueRequestsFromMapFunc[*gpuv1.ClusterPolicy](mapFn),
×
340
                        predicate.TypedGenerationChangedPredicate[*gpuv1.ClusterPolicy]{},
×
341
                ),
×
342
        )
×
343
        if err != nil {
×
344
                return err
×
345
        }
×
346

347
        nodePredicate := predicate.TypedFuncs[*corev1.Node]{
×
348
                CreateFunc: func(e event.TypedCreateEvent[*corev1.Node]) bool {
×
349
                        labels := e.Object.GetLabels()
×
350
                        return hasGPULabels(labels)
×
351
                },
×
352
                UpdateFunc: func(e event.TypedUpdateEvent[*corev1.Node]) bool {
×
353
                        logger := log.FromContext(ctx)
×
354
                        newLabels := e.ObjectNew.GetLabels()
×
355
                        oldLabels := e.ObjectOld.GetLabels()
×
356
                        nodeName := e.ObjectNew.GetName()
×
357

×
358
                        needsUpdate := hasGPULabels(newLabels) && !maps.Equal(newLabels, oldLabels)
×
359

×
360
                        if needsUpdate {
×
361
                                logger.Info("Node labels have been changed",
×
362
                                        "name", nodeName,
×
363
                                )
×
364
                        }
×
365
                        return needsUpdate
×
366
                },
367
                DeleteFunc: func(e event.TypedDeleteEvent[*corev1.Node]) bool {
×
368
                        labels := e.Object.GetLabels()
×
369
                        return hasGPULabels(labels)
×
370
                },
×
371
        }
372

373
        // Watch for changes to node labels
374
        err = c.Watch(
×
375
                source.Kind(mgr.GetCache(),
×
376
                        &corev1.Node{},
×
377
                        handler.TypedEnqueueRequestsFromMapFunc[*corev1.Node](nodeMapFn),
×
378
                        nodePredicate,
×
379
                ),
×
380
        )
×
381
        if err != nil {
×
382
                return err
×
383
        }
×
384

385
        // Watch for changes to secondary resources which each state manager manages
386
        watchSources := stateManager.GetWatchSources(mgr)
×
387
        for _, watchSource := range watchSources {
×
388
                err = c.Watch(
×
389
                        watchSource,
×
390
                )
×
391
                if err != nil {
×
392
                        return fmt.Errorf("error setting up Watch for source type %v: %w", watchSource, err)
×
393
                }
×
394
        }
395

396
        // Add an index key which allows our reconciler to quickly look up DaemonSets owned by an NVIDIADriver instance
397
        if err := mgr.GetFieldIndexer().IndexField(ctx, &appsv1.DaemonSet{}, consts.NVIDIADriverControllerIndexKey, func(rawObj client.Object) []string {
×
398
                ds := rawObj.(*appsv1.DaemonSet)
×
399
                owner := metav1.GetControllerOf(ds)
×
400
                if owner == nil {
×
401
                        return nil
×
402
                }
×
403
                if owner.APIVersion != nvidiav1alpha1.SchemeGroupVersion.String() || owner.Kind != nvidiav1alpha1.NVIDIADriverCRDName {
×
404
                        return nil
×
405
                }
×
406
                return []string{owner.Name}
×
407
        }); err != nil {
×
408
                return fmt.Errorf("failed to add index key: %w", err)
×
409
        }
×
410

411
        return nil
×
412
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc