NVIDIA / gpu-operator / 18929278744

30 Oct 2025 03:50AM UTC coverage: 22.389% (+0.04%) from 22.345%
Build 18929278744 · push · github · rahulait

Commit: fix logging for controllers
Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>

0 of 67 new or added lines in 3 files covered (0.0%).
3 existing lines in 2 files now uncovered.
2622 of 11711 relevant lines covered (22.39%).
0.25 hits per line.
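
For reference, the headline percentage follows directly from the line counts above, assuming the usual Coveralls definition (covered relevant lines divided by relevant lines): 2622 / 11711 ≈ 0.22389, i.e. 22.389%, and the change is 22.389% - 22.345% ≈ +0.04%. None of the 67 lines added by this push are executed by the test suite, which is consistent with the 0.0% shown for the file below.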

Source File
/controllers/clusterpolicy_controller.go (0.0% coverage)

/*
Copyright 2021.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
	"context"
	"fmt"

	"github.com/go-logr/logr"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/util/workqueue"

	"time"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
	"sigs.k8s.io/controller-runtime/pkg/source"

	gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
	"github.com/NVIDIA/gpu-operator/internal/conditions"
)

const (
	minDelayCR                      = 100 * time.Millisecond
	maxDelayCR                      = 3 * time.Second
	clusterPolicyControllerIndexKey = "metadata.nvidia.clusterpolicy.controller"
)

// blank assignment to verify that ReconcileClusterPolicy implements reconcile.Reconciler
var _ reconcile.Reconciler = &ClusterPolicyReconciler{}
var clusterPolicyCtrl ClusterPolicyController

// ClusterPolicyReconciler reconciles a ClusterPolicy object
type ClusterPolicyReconciler struct {
	client.Client
	Log              logr.Logger
	Scheme           *runtime.Scheme
	Namespace        string
	conditionUpdater conditions.Updater
}

// +kubebuilder:rbac:groups=nvidia.com,resources=*,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=config.openshift.io,resources=clusterversions;proxies,verbs=get;list;watch
// +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=use,resourceNames=privileged
// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles;clusterrolebindings;roles;rolebindings,verbs=*
// +kubebuilder:rbac:groups="",resources=namespaces;serviceaccounts;pods;pods/eviction;services;services/finalizers;endpoints,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=persistentvolumeclaims;events;configmaps;secrets;nodes,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=apps,resources=deployments;daemonsets;replicasets;statefulsets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=apps,resources=controllerrevisions,verbs=get;list;watch
// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors;prometheusrules,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=scheduling.k8s.io,resources=priorityclasses,verbs=get;list;watch;create
// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch
// +kubebuilder:rbac:groups=route.openshift.io,resources=routes,verbs=get;list;watch;create;update;patch
// +kubebuilder:rbac:groups=image.openshift.io,resources=imagestreams,verbs=get;list;watch
// +kubebuilder:rbac:groups=node.k8s.io,resources=runtimeclasses,verbs=get;list;create;update;watch;delete
// +kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=get;list;watch

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// TODO(user): Modify the Reconcile function to compare the state specified by
// the ClusterPolicy object against the actual cluster state, and then
// perform operations to make the cluster state reflect the state specified by
// the user.
//
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.7.0/pkg/reconcile
func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	_ = r.Log.WithValues("Reconciling ClusterPolicy", req.NamespacedName)

	// Fetch the ClusterPolicy instance
	instance := &gpuv1.ClusterPolicy{}
	if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
		err = fmt.Errorf("failed to get ClusterPolicy object: %w", err)
		r.Log.Error(err, "unable to fetch ClusterPolicy")
		clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
		if apierrors.IsNotFound(err) {
			// Request object not found, could have been deleted after reconcile request.
			// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
			// Return and don't requeue
			return reconcile.Result{}, nil
		}
		// Error reading the object - requeue the request.
		if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
			r.Log.Error(condErr, "failed to set condition")
		}
		return reconcile.Result{}, err
	}

	// TODO: Handle deletion of the main ClusterPolicy and cycle to the next one.
	// We already have a main Clusterpolicy
	if clusterPolicyCtrl.singleton != nil && clusterPolicyCtrl.singleton.Name != instance.Name {
		instance.SetStatus(gpuv1.Ignored, clusterPolicyCtrl.operatorNamespace)
		// do not change `clusterPolicyCtrl.operatorMetrics.reconciliationStatus` here,
		// spurious reconciliation
		return ctrl.Result{}, nil
	}

	if err := clusterPolicyCtrl.init(ctx, r, instance); err != nil {
		r.Log.Error(err, "unable to initialize ClusterPolicy controller")
		if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
			r.Log.Error(condErr, "failed to set condition")
		}
		if clusterPolicyCtrl.operatorMetrics != nil {
			clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
		}
		return ctrl.Result{}, err
	}

	if !clusterPolicyCtrl.hasNFDLabels {
		r.Log.Info("WARNING: NFD labels missing in the cluster, GPU nodes cannot be discovered.")
		clusterPolicyCtrl.operatorMetrics.reconciliationHasNFDLabels.Set(0)
	} else {
		clusterPolicyCtrl.operatorMetrics.reconciliationHasNFDLabels.Set(1)
	}
	if !clusterPolicyCtrl.hasGPUNodes {
		r.Log.Info("No GPU node can be found in the cluster.")
	}

	clusterPolicyCtrl.operatorMetrics.reconciliationTotal.Inc()
	overallStatus := gpuv1.Ready
	statesNotReady := []string{}
	for {
		status, statusError := clusterPolicyCtrl.step()
		if statusError != nil {
			clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady)
			clusterPolicyCtrl.operatorMetrics.reconciliationFailed.Inc()
			updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady)
			if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Failed to reconcile %s: %s", clusterPolicyCtrl.stateNames[clusterPolicyCtrl.idx], statusError.Error())); condErr != nil {
				r.Log.Error(condErr, "failed to set condition")
			}
			return ctrl.Result{}, statusError
		}

		if status == gpuv1.NotReady {
			overallStatus = gpuv1.NotReady
			statesNotReady = append(statesNotReady, clusterPolicyCtrl.stateNames[clusterPolicyCtrl.idx-1])
		}
		r.Log.Info("ClusterPolicy step completed",
			"state:", clusterPolicyCtrl.stateNames[clusterPolicyCtrl.idx-1],
			"status", status)

		if clusterPolicyCtrl.last() {
			break
		}
	}

	// if any state is not ready, requeue for reconcile after 5 seconds
	if overallStatus != gpuv1.Ready {
		clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady)
		clusterPolicyCtrl.operatorMetrics.reconciliationFailed.Inc()

		err := fmt.Errorf("ClusterPolicy is not ready, states not ready: %v", statesNotReady)
		r.Log.Error(err, "ClusterPolicy not yet ready")
		updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady)
		if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.OperandNotReady, err.Error()); condErr != nil {
			r.Log.Error(condErr, "failed to set condition")
		}
		return ctrl.Result{RequeueAfter: time.Second * 5}, nil
	}

	if !clusterPolicyCtrl.hasNFDLabels {
		// no NFD-labelled node in the cluster (required dependency),
		// watch periodically for the labels to appear
		var requeueAfter = time.Second * 45
		r.Log.Info("No NFD label found, polling for new nodes.",
			"requeueAfter", requeueAfter)

		// Update CR state as ready as all states are complete
		updateCRState(ctx, r, req.NamespacedName, gpuv1.Ready)
		if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.NFDLabelsMissing, "No NFD labels found"); condErr != nil {
			r.Log.Error(condErr, "failed to set condition")
		}

		clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusSuccess)

		return ctrl.Result{RequeueAfter: requeueAfter}, nil
	}

	// Update CR state as ready as all states are complete
	updateCRState(ctx, r, req.NamespacedName, gpuv1.Ready)
	clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusSuccess)
	clusterPolicyCtrl.operatorMetrics.reconciliationLastSuccess.Set(float64(time.Now().Unix()))

	var infoStr string
	if !clusterPolicyCtrl.hasGPUNodes {
		infoStr = "No GPU node found, watching for new nodes to join the cluster."
		r.Log.Info(infoStr, "hasNFDLabels", clusterPolicyCtrl.hasNFDLabels)
		if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.NoGPUNodes, infoStr); condErr != nil {
			r.Log.Error(condErr, "failed to set condition")
			return ctrl.Result{}, condErr
		}
	} else {
		infoStr = "ClusterPolicy is ready as all resources have been successfully reconciled"
		r.Log.Info(infoStr)
		if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, infoStr); condErr != nil {
			r.Log.Error(condErr, "failed to set condition")
			return ctrl.Result{}, condErr
		}
	}
	return ctrl.Result{}, nil
}

func updateCRState(ctx context.Context, r *ClusterPolicyReconciler, namespacedName types.NamespacedName, state gpuv1.State) {
	// Fetch latest instance and update state to avoid version mismatch
	instance := &gpuv1.ClusterPolicy{}
	if err := r.Get(ctx, namespacedName, instance); err != nil {
		r.Log.Error(err, "Failed to get ClusterPolicy instance for status update")
	}
	if instance.Status.State == state {
		// state is unchanged
		return
	}
	// Update the CR state
	instance.SetStatus(state, clusterPolicyCtrl.operatorNamespace)
	if err := r.Client.Status().Update(ctx, instance); err != nil {
		r.Log.Error(err, "Failed to update ClusterPolicy status")
	}
}

func addWatchNewGPUNode(r *ClusterPolicyReconciler, c controller.Controller, mgr ctrl.Manager) error {
	// Define a mapping from the Node object in the event to one or more
	// ClusterPolicy objects to Reconcile
	mapFn := func(ctx context.Context, n *corev1.Node) []reconcile.Request {
		// find all the ClusterPolicy to trigger their reconciliation
		opts := []client.ListOption{} // Namespace = "" to list across all namespaces.
		list := &gpuv1.ClusterPolicyList{}

		err := r.List(ctx, list, opts...)
		if err != nil {
			r.Log.Error(err, "Unable to list ClusterPolicies")
			return []reconcile.Request{}
		}

		cpToRec := []reconcile.Request{}

		for _, cp := range list.Items {
			cpToRec = append(cpToRec, reconcile.Request{NamespacedName: types.NamespacedName{
				Name:      cp.GetName(),
				Namespace: cp.GetNamespace(),
			}})
		}
		r.Log.Info("Reconciliate ClusterPolicies after node label update", "nb", len(cpToRec))

		return cpToRec
	}

	p := predicate.TypedFuncs[*corev1.Node]{
		CreateFunc: func(e event.TypedCreateEvent[*corev1.Node]) bool {
			labels := e.Object.GetLabels()

			return hasGPULabels(labels)
		},
		UpdateFunc: func(e event.TypedUpdateEvent[*corev1.Node]) bool {
			newLabels := e.ObjectNew.GetLabels()
			oldLabels := e.ObjectOld.GetLabels()
			nodeName := e.ObjectNew.GetName()

			gpuCommonLabelMissing := hasGPULabels(newLabels) && !hasCommonGPULabel(newLabels)
			gpuCommonLabelOutdated := !hasGPULabels(newLabels) && hasCommonGPULabel(newLabels)
			migManagerLabelMissing := hasMIGCapableGPU(newLabels) && !hasMIGManagerLabel(newLabels)
			commonOperandsLabelChanged := hasOperandsDisabled(oldLabels) != hasOperandsDisabled(newLabels)

			oldGPUWorkloadConfig, _ := getWorkloadConfig(oldLabels, true)
			newGPUWorkloadConfig, _ := getWorkloadConfig(newLabels, true)
			gpuWorkloadConfigLabelChanged := oldGPUWorkloadConfig != newGPUWorkloadConfig

			oldOSTreeLabel := oldLabels[nfdOSTreeVersionLabelKey]
			newOSTreeLabel := newLabels[nfdOSTreeVersionLabelKey]
			osTreeLabelChanged := oldOSTreeLabel != newOSTreeLabel

			needsUpdate := gpuCommonLabelMissing ||
				gpuCommonLabelOutdated ||
				migManagerLabelMissing ||
				commonOperandsLabelChanged ||
				gpuWorkloadConfigLabelChanged ||
				osTreeLabelChanged

			if needsUpdate {
				r.Log.Info("Node needs an update",
					"name", nodeName,
					"gpuCommonLabelMissing", gpuCommonLabelMissing,
					"gpuCommonLabelOutdated", gpuCommonLabelOutdated,
					"migManagerLabelMissing", migManagerLabelMissing,
					"commonOperandsLabelChanged", commonOperandsLabelChanged,
					"gpuWorkloadConfigLabelChanged", gpuWorkloadConfigLabelChanged,
					"osTreeLabelChanged", osTreeLabelChanged,
				)
			}
			return needsUpdate
		},
		DeleteFunc: func(e event.TypedDeleteEvent[*corev1.Node]) bool {
			// if an RHCOS GPU node is deleted, trigger a
			// reconciliation to ensure that there is no dangling
			// OpenShift Driver-Toolkit (RHCOS version-specific)
			// DaemonSet.
			// NB: we cannot know here if the DriverToolkit is
			// enabled.

			labels := e.Object.GetLabels()

			_, hasOSTreeLabel := labels[nfdOSTreeVersionLabelKey]

			return hasGPULabels(labels) && hasOSTreeLabel
		},
	}

	err := c.Watch(
		source.Kind(mgr.GetCache(),
			&corev1.Node{},
			handler.TypedEnqueueRequestsFromMapFunc[*corev1.Node](mapFn),
			p,
		),
	)

	return err
}

// SetupWithManager sets up the controller with the Manager.
func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
	// Create a new controller
	c, err := controller.New("clusterpolicy-controller", mgr, controller.Options{Reconciler: r, MaxConcurrentReconciles: 1,
		RateLimiter: workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](minDelayCR, maxDelayCR)})
	if err != nil {
		return err
	}

	// initialize condition updater
	r.conditionUpdater = conditions.NewClusterPolicyUpdater(mgr.GetClient())

	// Watch for changes to primary resource ClusterPolicy
	err = c.Watch(source.Kind(
		mgr.GetCache(),
		&gpuv1.ClusterPolicy{},
		&handler.TypedEnqueueRequestForObject[*gpuv1.ClusterPolicy]{},
		predicate.TypedGenerationChangedPredicate[*gpuv1.ClusterPolicy]{},
	),
	)
	if err != nil {
		return err
	}

	// Watch for changes to Node labels and requeue the owner ClusterPolicy
	err = addWatchNewGPUNode(r, c, mgr)
	if err != nil {
		return err
	}

	// TODO(user): Modify this to be the types you create that are owned by the primary resource
	// Watch for changes to secondary resource Daemonsets and requeue the owner ClusterPolicy
	err = c.Watch(
		source.Kind(mgr.GetCache(),
			&appsv1.DaemonSet{},
			handler.TypedEnqueueRequestForOwner[*appsv1.DaemonSet](mgr.GetScheme(), mgr.GetRESTMapper(), &gpuv1.ClusterPolicy{},
				handler.OnlyControllerOwner()),
		),
	)
	if err != nil {
		return err
	}

	// Add an index key which allows our reconciler to quickly look up DaemonSets owned by it.
	//
	// (cdesiniotis) Ideally we could duplicate this index for all the k8s objects
	// that ClusterPolicy manages, that way, we could easily restrict the ClusterPolicy
	// controller to only update / delete objects it owns. Unfortunately, the
	// underlying implementation of the index does not support generic container types
	// (i.e. unstructured.Unstructured{}). For additional details, see the comment in
	// the last link of the below call stack:
	// IndexField(): https://github.com/kubernetes-sigs/controller-runtime/blob/main/pkg/cache/informer_cache.go#L204
	//   GetInformer(): https://github.com/kubernetes-sigs/controller-runtime/blob/main/pkg/cache/informer_cache.go#L168
	//     GVKForObject(): https://github.com/kubernetes-sigs/controller-runtime/blob/main/pkg/client/apiutil/apimachinery.go#L113
	if err := mgr.GetFieldIndexer().IndexField(ctx, &appsv1.DaemonSet{}, clusterPolicyControllerIndexKey, func(rawObj client.Object) []string {
		ds := rawObj.(*appsv1.DaemonSet)
		owner := metav1.GetControllerOf(ds)
		if owner == nil {
			return nil
		}
		if owner.APIVersion != gpuv1.SchemeGroupVersion.String() || owner.Kind != "ClusterPolicy" {
			return nil
		}
		return []string{owner.Name}
	}); err != nil {
		return fmt.Errorf("failed to add index key: %w", err)
	}

	return nil
}
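
The uncovered new lines above all follow the same pattern: wrap the underlying error with fmt.Errorf and %w, log it through the controller's logr.Logger, and, where relevant, still classify it with apierrors.IsNotFound. Below is a minimal standalone sketch of that pattern, not part of the operator; the stdr logger and the group/resource/name values are assumptions chosen purely for illustration.

package main

import (
	"fmt"
	"log"
	"os"

	"github.com/go-logr/stdr"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime/schema"
)

func main() {
	// A plain stdlib-backed logr.Logger; the operator wires its Log field
	// from controller-runtime, but any logr implementation behaves the same.
	logger := stdr.New(log.New(os.Stderr, "", log.LstdFlags))

	// Simulate the "object not found" error that r.Get returns after the
	// ClusterPolicy has been deleted (values are illustrative).
	getErr := apierrors.NewNotFound(
		schema.GroupResource{Group: "nvidia.com", Resource: "clusterpolicies"},
		"cluster-policy",
	)

	// Wrap for context, then log: this is the shape of the new logging calls.
	err := fmt.Errorf("failed to get ClusterPolicy object: %w", getErr)
	logger.Error(err, "unable to fetch ClusterPolicy")

	// apierrors.IsNotFound inspects the error chain, so the caller can still
	// decide not to requeue even though the error was wrapped.
	if apierrors.IsNotFound(err) {
		logger.Info("ClusterPolicy was deleted; nothing to requeue")
	}
}

Because IsNotFound in current k8s.io/apimachinery releases walks the error chain (via errors.As), wrapping with %w keeps the not-found branch working while the log line gains context, which is presumably why the new code wraps before logging.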