NVIDIA / gpu-operator, build 18929278744 (push, via github)
30 Oct 2025 03:50AM UTC coverage: 22.389% (+0.04%) from 22.345%

rahulait: fix logging for controllers
Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>

0 of 67 new or added lines in 3 files covered. (0.0%)

3 existing lines in 2 files now uncovered.

2622 of 11711 relevant lines covered (22.39%)

0.25 hits per line

Source File

/controllers/upgrade_controller.go (0.0% of lines covered)
/*
Copyright 2022 NVIDIA CORPORATION & AFFILIATES

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
	"context"
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/client-go/util/workqueue"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
	"sigs.k8s.io/controller-runtime/pkg/source"

	"github.com/NVIDIA/k8s-operator-libs/pkg/consts"
	"github.com/NVIDIA/k8s-operator-libs/pkg/upgrade"
	"github.com/go-logr/logr"
	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/apimachinery/pkg/runtime"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
	nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
)

// UpgradeReconciler reconciles Driver Daemon Sets for upgrade
type UpgradeReconciler struct {
	client.Client
	Log          logr.Logger
	Scheme       *runtime.Scheme
	StateManager upgrade.ClusterUpgradeStateManager
}

const (
	plannedRequeueInterval = time.Minute * 2
	// DriverLabelKey indicates pod label key of the driver
	DriverLabelKey = "app"
	// DriverLabelValue indicates pod label value of the driver
	DriverLabelValue = "nvidia-driver-daemonset"
	// UpgradeSkipDrainLabelSelector indicates the pod selector label to skip with drain
	UpgradeSkipDrainLabelSelector = "nvidia.com/gpu-driver-upgrade-drain.skip!=true"
	// AppComponentLabelKey indicates the label key of the component
	AppComponentLabelKey = "app.kubernetes.io/component"
	// AppComponentLabelValue indicates the label values of the nvidia-gpu-driver component
	AppComponentLabelValue = "nvidia-driver"
)

//nolint
// +kubebuilder:rbac:groups=mellanox.com,resources=*,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;patch
// +kubebuilder:rbac:groups="",resources=pods,verbs=list
// +kubebuilder:rbac:groups=apps,resources=deployments;daemonsets;replicasets;statefulsets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=apps,resources=deployments/finalizers,verbs=update

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	reqLogger := r.Log.WithValues("upgrade", req.NamespacedName)
	reqLogger.V(consts.LogLevelInfo).Info("Reconciling Upgrade")

	// Fetch the ClusterPolicy instance
	clusterPolicy := &gpuv1.ClusterPolicy{}
	err := r.Get(ctx, req.NamespacedName, clusterPolicy)
	if err != nil {
		reqLogger.Error(err, "Error getting ClusterPolicy object")
		if clusterPolicyCtrl.operatorMetrics != nil {
			clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
		}
		if apierrors.IsNotFound(err) {
			// Request object not found, could have been deleted after reconcile request.
			// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
			// Return and don't requeue
			return reconcile.Result{}, nil
		}
		// Error reading the object - requeue the request.
		return reconcile.Result{}, err
	}

	if clusterPolicy.Spec.SandboxWorkloads.IsEnabled() {
		reqLogger.V(consts.LogLevelInfo).Info("Advanced driver upgrade policy is not supported when 'sandboxWorkloads.enabled=true'" +
			"in ClusterPolicy, cleaning up upgrade state and skipping reconciliation")
		// disable driver upgrade metrics
		if clusterPolicyCtrl.operatorMetrics != nil {
			clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled)
		}
		return ctrl.Result{}, r.removeNodeUpgradeStateLabels(ctx)
	}

	if clusterPolicy.Spec.Driver.UpgradePolicy == nil ||
		!clusterPolicy.Spec.Driver.UpgradePolicy.AutoUpgrade {
		reqLogger.V(consts.LogLevelInfo).Info("Advanced driver upgrade policy is disabled, cleaning up upgrade state and skipping reconciliation")
		// disable driver upgrade metrics
		if clusterPolicyCtrl.operatorMetrics != nil {
			clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled)
		}
		return ctrl.Result{}, r.removeNodeUpgradeStateLabels(ctx)
	}
	// enable driver upgrade metrics
	if clusterPolicyCtrl.operatorMetrics != nil {
		clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeEnabled)
	}

	var driverLabel map[string]string

	// initialize with common app=nvidia-driver-daemonset label
	driverLabelKey := DriverLabelKey
	driverLabelValue := DriverLabelValue

	if clusterPolicy.Spec.Driver.UseNvdiaDriverCRDType() {
		// app component label is added for all new driver daemonsets deployed by NVIDIADriver controller
		driverLabelKey = AppComponentLabelKey
		driverLabelValue = AppComponentLabelValue
	} else if clusterPolicyCtrl.openshift != "" && clusterPolicyCtrl.ocpDriverToolkit.enabled {
		// For OCP, when DTK is enabled app=nvidia-driver-daemonset label is not constant and changes
		// based on rhcos version. Hence use DTK label instead
		driverLabelKey = ocpDriverToolkitIdentificationLabel
		driverLabelValue = ocpDriverToolkitIdentificationValue
	}

	driverLabel = map[string]string{driverLabelKey: driverLabelValue}
	reqLogger.Info("Using label selector", "key", driverLabelKey, "value", driverLabelValue)

	state, err := r.StateManager.BuildState(ctx, clusterPolicyCtrl.operatorNamespace,
		driverLabel)
	if err != nil {
		r.Log.Error(err, "Failed to build cluster upgrade state")
		return ctrl.Result{}, err
	}

	reqLogger.Info("Propagate state to state manager")
	reqLogger.V(consts.LogLevelDebug).Info("Current cluster upgrade state", "state", state)

	totalNodes := r.StateManager.GetTotalManagedNodes(state)
	maxUnavailable := totalNodes
	if clusterPolicy.Spec.Driver.UpgradePolicy != nil && clusterPolicy.Spec.Driver.UpgradePolicy.MaxUnavailable != nil {
		maxUnavailable, err = intstr.GetScaledValueFromIntOrPercent(clusterPolicy.Spec.Driver.UpgradePolicy.MaxUnavailable, totalNodes, true)
		if err != nil {
			r.Log.Error(err, "Failed to compute maxUnavailable from the current total nodes")
			return ctrl.Result{}, err
		}
	}

	// We want to skip operator itself during the drain because the upgrade process might hang
	// if the operator is evicted and can't be rescheduled to any other node, e.g. in a single-node cluster.
	// It's safe to do because the goal of the node draining during the upgrade is to
	// evict pods that might use driver and operator doesn't use in its own pod.
	if clusterPolicy.Spec.Driver.UpgradePolicy.DrainSpec.PodSelector == "" {
		clusterPolicy.Spec.Driver.UpgradePolicy.DrainSpec.PodSelector = UpgradeSkipDrainLabelSelector
	} else {
		clusterPolicy.Spec.Driver.UpgradePolicy.DrainSpec.PodSelector =
			fmt.Sprintf("%s,%s", clusterPolicy.Spec.Driver.UpgradePolicy.DrainSpec.PodSelector, UpgradeSkipDrainLabelSelector)
	}

	// log metrics with the current state
	if clusterPolicyCtrl.operatorMetrics != nil {
		clusterPolicyCtrl.operatorMetrics.upgradesInProgress.Set(float64(r.StateManager.GetUpgradesInProgress(state)))
		clusterPolicyCtrl.operatorMetrics.upgradesDone.Set(float64(r.StateManager.GetUpgradesDone(state)))
		clusterPolicyCtrl.operatorMetrics.upgradesAvailable.Set(float64(r.StateManager.GetUpgradesAvailable(state, clusterPolicy.Spec.Driver.UpgradePolicy.MaxParallelUpgrades, maxUnavailable)))
		clusterPolicyCtrl.operatorMetrics.upgradesFailed.Set(float64(r.StateManager.GetUpgradesFailed(state)))
		clusterPolicyCtrl.operatorMetrics.upgradesPending.Set(float64(r.StateManager.GetUpgradesPending(state)))
	}

	err = r.StateManager.ApplyState(ctx, state, clusterPolicy.Spec.Driver.UpgradePolicy)
	if err != nil {
		r.Log.Error(err, "Failed to apply cluster upgrade state")
		return ctrl.Result{}, err
	}

	// In some cases if node state changes fail to apply, upgrade process
	// might become stuck until the new reconcile loop is scheduled.
	// Since node/ds/clusterpolicy updates from outside of the upgrade flow
	// are not guaranteed, for safety reconcile loop should be requeued every few minutes.
	return ctrl.Result{Requeue: true, RequeueAfter: plannedRequeueInterval}, nil
}

// removeNodeUpgradeStateLabels loops over nodes in the cluster and removes "nvidia.com/gpu-driver-upgrade-state"
// It is used for cleanup when autoUpgrade feature gets disabled
func (r *UpgradeReconciler) removeNodeUpgradeStateLabels(ctx context.Context) error {
	r.Log.Info("Resetting node upgrade labels from all nodes")

	nodeList := &corev1.NodeList{}
	err := r.List(ctx, nodeList)
	if err != nil {
		r.Log.Error(err, "Failed to get node list to reset upgrade labels")
		return err
	}

	upgradeStateLabel := upgrade.GetUpgradeStateLabelKey()

	for i := range nodeList.Items {
		node := &nodeList.Items[i]
		_, present := node.Labels[upgradeStateLabel]
		if present {
			delete(node.Labels, upgradeStateLabel)
			err = r.Update(ctx, node)
			if err != nil {
				r.Log.Error(
					err, "Failed to reset upgrade state label from node", "node", node)
				return err
			}
		}
	}
	return nil
}

// SetupWithManager sets up the controller with the Manager.
//
//nolint:dupl
func (r *UpgradeReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
	// Create a new controller
	c, err := controller.New("upgrade-controller", mgr, controller.Options{Reconciler: r, MaxConcurrentReconciles: 1,
		RateLimiter: workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](minDelayCR, maxDelayCR)})
	if err != nil {
		return err
	}

	// Watch for changes to primary resource ClusterPolicy
	err = c.Watch(source.Kind(
		mgr.GetCache(),
		&gpuv1.ClusterPolicy{},
		&handler.TypedEnqueueRequestForObject[*gpuv1.ClusterPolicy]{},
		predicate.TypedGenerationChangedPredicate[*gpuv1.ClusterPolicy]{}),
	)
	if err != nil {
		return err
	}

	// Define a mapping from the Node object in the event to one or more
	// ClusterPolicy objects to Reconcile
	nodeMapFn := func(ctx context.Context, o *corev1.Node) []reconcile.Request {
		return getClusterPoliciesToReconcile(ctx, mgr.GetClient())
	}

	// Watch for changes to node labels
	// TODO: only watch for changes to upgrade state label
	err = c.Watch(
		source.Kind(
			mgr.GetCache(),
			&corev1.Node{},
			handler.TypedEnqueueRequestsFromMapFunc[*corev1.Node](nodeMapFn),
			predicate.TypedLabelChangedPredicate[*corev1.Node]{},
		),
	)
	if err != nil {
		return err
	}

	// Define a mapping between the DaemonSet object in the event
	// to one or more ClusterPolicy instances to reconcile.
	//
	// For events generated by DaemonSets, ensure the object is
	// owned by either ClusterPolicy or NVIDIADriver.
	dsMapFn := func(ctx context.Context, a *appsv1.DaemonSet) []reconcile.Request {
		ownerRefs := a.GetOwnerReferences()

		ownedByNVIDIA := false
		for _, owner := range ownerRefs {
			if (owner.APIVersion == gpuv1.SchemeGroupVersion.String() && owner.Kind == "ClusterPolicy") ||
				(owner.APIVersion == nvidiav1alpha1.SchemeGroupVersion.String() && owner.Kind == "NVIDIADriver") {
				ownedByNVIDIA = true
				break
			}
		}

		if !ownedByNVIDIA {
			return nil
		}

		return getClusterPoliciesToReconcile(ctx, mgr.GetClient())
	}

	// Watch for changes to NVIDIA driver daemonsets and enqueue ClusterPolicy
	// TODO: use one common label to identify all NVIDIA driver DaemonSets
	appLabelSelector := predicate.NewTypedPredicateFuncs(func(ds *appsv1.DaemonSet) bool {
		ls := metav1.LabelSelector{MatchLabels: map[string]string{DriverLabelKey: DriverLabelValue}}
		selector, _ := metav1.LabelSelectorAsSelector(&ls)
		return selector.Matches(labels.Set(ds.GetLabels()))
	})

	dtkLabelSelector := predicate.NewTypedPredicateFuncs(func(ds *appsv1.DaemonSet) bool {
		ls := metav1.LabelSelector{MatchLabels: map[string]string{ocpDriverToolkitIdentificationLabel: ocpDriverToolkitIdentificationValue}}
		selector, _ := metav1.LabelSelectorAsSelector(&ls)
		return selector.Matches(labels.Set(ds.GetLabels()))
	})

	componentLabelSelector := predicate.NewTypedPredicateFuncs(func(ds *appsv1.DaemonSet) bool {
		ls := metav1.LabelSelector{MatchLabels: map[string]string{AppComponentLabelKey: AppComponentLabelValue}}
		selector, _ := metav1.LabelSelectorAsSelector(&ls)
		return selector.Matches(labels.Set(ds.GetLabels()))
	})

	err = c.Watch(
		source.Kind(
			mgr.GetCache(),
			&appsv1.DaemonSet{},
			handler.TypedEnqueueRequestsFromMapFunc[*appsv1.DaemonSet](dsMapFn),
			predicate.And[*appsv1.DaemonSet](
				predicate.TypedGenerationChangedPredicate[*appsv1.DaemonSet]{},
				predicate.Or[*appsv1.DaemonSet](appLabelSelector, dtkLabelSelector, componentLabelSelector),
			),
		))
	if err != nil {
		return err
	}

	return nil
}

func getClusterPoliciesToReconcile(ctx context.Context, k8sClient client.Client) []reconcile.Request {
	logger := log.FromContext(ctx)
	opts := []client.ListOption{}
	list := &gpuv1.ClusterPolicyList{}

	err := k8sClient.List(ctx, list, opts...)
	if err != nil {
		logger.Error(err, "Unable to list ClusterPolicies")
		return []reconcile.Request{}
	}

	cpToRec := []reconcile.Request{}

	for _, cp := range list.Items {
		cpToRec = append(cpToRec, reconcile.Request{NamespacedName: types.NamespacedName{
			Name:      cp.GetName(),
			Namespace: cp.GetNamespace(),
		}})
	}

	return cpToRec
}
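
The report shows none of the new or changed lines in this file exercised by tests. As a rough illustration of how the label-cleanup path could be covered, here is a minimal sketch of a unit test built on controller-runtime's fake client; the test name, node name, and label value are hypothetical, and it assumes the test sits in the same controllers package so it can call the unexported method directly.

package controllers

import (
	"context"
	"testing"

	"github.com/go-logr/logr"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"

	"github.com/NVIDIA/k8s-operator-libs/pkg/upgrade"
)

// TestRemoveNodeUpgradeStateLabels is a hypothetical sketch: it seeds a fake
// client with one labeled node and checks that removeNodeUpgradeStateLabels
// strips the upgrade-state label.
func TestRemoveNodeUpgradeStateLabels(t *testing.T) {
	stateLabel := upgrade.GetUpgradeStateLabelKey()
	node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{
		Name:   "worker-0", // hypothetical node name
		Labels: map[string]string{stateLabel: "upgrade-done"}, // placeholder label value
	}}

	// The fake client builder defaults to the client-go scheme, which already
	// registers core/v1 Nodes, so no extra scheme setup is needed here.
	c := fake.NewClientBuilder().WithObjects(node).Build()
	r := &UpgradeReconciler{Client: c, Log: logr.Discard()}

	if err := r.removeNodeUpgradeStateLabels(context.Background()); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	updated := &corev1.Node{}
	if err := c.Get(context.Background(), types.NamespacedName{Name: "worker-0"}, updated); err != nil {
		t.Fatalf("failed to read node back: %v", err)
	}
	if _, ok := updated.Labels[stateLabel]; ok {
		t.Fatal("expected the upgrade state label to be removed")
	}
}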