• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

NVIDIA / gpu-operator / 28543226650

01 Jul 2026 07:42PM UTC coverage: 31.429% (+0.2%) from 31.227%
28543226650

Pull #2571

github

karthikvetrivel
Add DCGM, DCGM Exporter, and DRA validator operands to GPUClusterConfig

Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
Pull Request #2571: Add GPUClusterConfig CRD and controller for DRA-based stack

416 of 1264 new or added lines in 25 files covered. (32.91%)

19 existing lines in 4 files now uncovered.

4522 of 14388 relevant lines covered (31.43%)

0.36 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

45.07
/controllers/gpuclusterconfig_controller.go
1
/*
2
Copyright 2025.
3

4
Licensed under the Apache License, Version 2.0 (the "License");
5
you may not use this file except in compliance with the License.
6
You may obtain a copy of the License at
7

8
    http://www.apache.org/licenses/LICENSE-2.0
9

10
Unless required by applicable law or agreed to in writing, software
11
distributed under the License is distributed on an "AS IS" BASIS,
12
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
See the License for the specific language governing permissions and
14
limitations under the License.
15
*/
16

17
package controllers
18

19
import (
20
        "context"
21
        "fmt"
22
        "time"
23

24
        apierrors "k8s.io/apimachinery/pkg/api/errors"
25
        "k8s.io/apimachinery/pkg/runtime"
26
        "k8s.io/apimachinery/pkg/types"
27
        "k8s.io/client-go/util/workqueue"
28
        ctrl "sigs.k8s.io/controller-runtime"
29
        "sigs.k8s.io/controller-runtime/pkg/client"
30
        "sigs.k8s.io/controller-runtime/pkg/controller"
31
        "sigs.k8s.io/controller-runtime/pkg/handler"
32
        "sigs.k8s.io/controller-runtime/pkg/log"
33
        "sigs.k8s.io/controller-runtime/pkg/predicate"
34
        "sigs.k8s.io/controller-runtime/pkg/reconcile"
35
        "sigs.k8s.io/controller-runtime/pkg/source"
36

37
        nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
38
        "github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
39
        "github.com/NVIDIA/gpu-operator/internal/conditions"
40
        "github.com/NVIDIA/gpu-operator/internal/consts"
41
        "github.com/NVIDIA/gpu-operator/internal/state"
42
)
43

44
// GPUClusterConfigReconciler reconciles a GPUClusterConfig object
45
type GPUClusterConfigReconciler struct {
46
        client.Client
47
        Scheme      *runtime.Scheme
48
        ClusterInfo clusterinfo.Interface
49
        Namespace   string
50

51
        stateManager     state.Manager
52
        conditionUpdater conditions.Updater
53

54
        // singleton is the GPUClusterConfig that owns reconciliation; the first instance to
55
        // reconcile claims it (first-wins), mirroring ClusterPolicy.
56
        singleton *nvidiav1alpha1.GPUClusterConfig
57
}
58

59
//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusterconfigs,verbs=get;list;watch;create;update;patch;delete
60
//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusterconfigs/status,verbs=get;update;patch
61
//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusterconfigs/finalizers,verbs=update
62
//+kubebuilder:rbac:groups=nvidia.com,resources=clusterpolicies,verbs=get;list;watch
63
//+kubebuilder:rbac:groups="",resources=namespaces,verbs=get;update;patch
64
//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaimtemplates,verbs=get;list;watch;create;update;delete
65

66
func (r *GPUClusterConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
1✔
67
        logger := log.FromContext(ctx)
1✔
68
        logger.V(consts.LogLevelInfo).Info("Reconciling GPUClusterConfig")
1✔
69

1✔
70
        instance := &nvidiav1alpha1.GPUClusterConfig{}
1✔
71
        if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
2✔
72
                if apierrors.IsNotFound(err) {
2✔
73
                        // Deleted; owned objects are garbage-collected, so there is nothing to clean up.
1✔
74
                        return ctrl.Result{}, nil
1✔
75
                }
1✔
76
                // instance was not populated by the failed Get, so there is no object to
77
                // update status on; just surface the error for requeue.
NEW
78
                logger.Error(err, "error getting GPUClusterConfig object")
×
NEW
79
                return ctrl.Result{}, fmt.Errorf("error getting GPUClusterConfig object: %w", err)
×
80
        }
81

82
        // GPUClusterConfig (DRA path) is mutually exclusive with ClusterPolicy: if one
83
        // exists, yield to it rather than deploying the DRA stack alongside it.
84
        clusterPolicy, _, err := resolveActiveConfig(ctx, r.Client)
1✔
85
        if err != nil {
1✔
NEW
86
                return ctrl.Result{}, err
×
NEW
87
        }
×
88
        if clusterPolicy != nil {
2✔
89
                logger.V(consts.LogLevelWarning).Info("ClusterPolicy present, skipping mutually exclusive GPUClusterConfig")
1✔
90
                if err := r.updateCrStatus(ctx, instance, nvidiav1alpha1.Disabled); err != nil {
1✔
NEW
91
                        return ctrl.Result{}, err
×
NEW
92
                }
×
93
                msg := "GPUClusterConfig is mutually exclusive with ClusterPolicy; remove the ClusterPolicy or disable GPUClusterConfig"
1✔
94
                if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, msg); condErr != nil {
1✔
NEW
95
                        logger.Error(condErr, "failed to set condition")
×
NEW
96
                }
×
97
                return ctrl.Result{}, nil
1✔
98
        }
99

100
        // Singleton, first-wins (mirroring ClusterPolicy): the first instance to reconcile
101
        // claims ownership; any other instance is marked Ignored and skipped. The owner is
102
        // held in memory, so the choice resets on operator restart.
103
        if r.singleton != nil && r.singleton.Name != instance.Name {
2✔
104
                logger.V(consts.LogLevelWarning).Info("Multiple GPUClusterConfig instances found, ignoring this one",
1✔
105
                        "name", instance.Name, "owner", r.singleton.Name)
1✔
106
                if err := r.updateCrStatus(ctx, instance, nvidiav1alpha1.Ignored); err != nil {
1✔
NEW
107
                        return ctrl.Result{}, err
×
NEW
108
                }
×
109
                return ctrl.Result{}, nil
1✔
110
        }
111
        r.singleton = instance
1✔
112

1✔
113
        infoCatalog := state.NewInfoCatalog()
1✔
114
        infoCatalog.Add(state.InfoTypeClusterInfo, r.ClusterInfo)
1✔
115

1✔
116
        managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)
1✔
117

1✔
118
        if err := r.updateCrStatus(ctx, instance, nvidiav1alpha1.State(managerStatus.Status)); err != nil {
1✔
NEW
119
                return ctrl.Result{}, err
×
NEW
120
        }
×
121

122
        if managerStatus.Status != state.SyncStateReady {
1✔
NEW
123
                logger.Info("GPUClusterConfig instance is not ready")
×
NEW
124
                for _, result := range managerStatus.StatesStatus {
×
NEW
125
                        if result.Status != state.SyncStateReady && result.ErrInfo != nil {
×
NEW
126
                                if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Error syncing state %s: %v", result.StateName, result.ErrInfo)); condErr != nil {
×
NEW
127
                                        logger.Error(condErr, "failed to set condition")
×
NEW
128
                                }
×
NEW
129
                                return ctrl.Result{RequeueAfter: time.Second * 5}, nil
×
130
                        }
131
                }
132
                // no state reported an error, so we are waiting on operand pods
NEW
133
                if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.OperandNotReady, "Waiting for operand pods to be ready"); condErr != nil {
×
NEW
134
                        logger.Error(condErr, "failed to set condition")
×
NEW
135
                }
×
NEW
136
                return ctrl.Result{RequeueAfter: time.Second * 5}, nil
×
137
        }
138

139
        if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, "All resources have been successfully reconciled"); condErr != nil {
1✔
NEW
140
                logger.Error(condErr, "failed to set condition")
×
NEW
141
                return ctrl.Result{}, condErr
×
NEW
142
        }
×
143
        // Resync periodically so out-of-band changes (a deleted DeviceClass/VAP, or a
144
        // newly-created ClusterPolicy) are detected and reconciled even while ready;
145
        // only DaemonSets are watched, and the ready path is otherwise event-driven.
146
        return ctrl.Result{RequeueAfter: time.Minute}, nil
1✔
147
}
148

149
// updateCrStatus writes desired to the CR's status, skipping the write when it is already current.
150
func (r *GPUClusterConfigReconciler) updateCrStatus(ctx context.Context, cr *nvidiav1alpha1.GPUClusterConfig, desired nvidiav1alpha1.State) error {
1✔
151
        reqLogger := log.FromContext(ctx)
1✔
152

1✔
153
        // Refetch to avoid a resourceVersion conflict.
1✔
154
        instance := &nvidiav1alpha1.GPUClusterConfig{}
1✔
155
        if err := r.Get(ctx, types.NamespacedName{Name: cr.Name}, instance); err != nil {
1✔
NEW
156
                reqLogger.Error(err, "Failed to get GPUClusterConfig instance for status update")
×
NEW
157
                return err
×
NEW
158
        }
×
159

160
        if instance.Status.State == desired && instance.Status.Namespace == r.Namespace {
1✔
NEW
161
                return nil
×
NEW
162
        }
×
163
        instance.Status.State = desired
1✔
164
        instance.Status.Namespace = r.Namespace
1✔
165

1✔
166
        reqLogger.V(consts.LogLevelInfo).Info("Updating CR Status", "Status", instance.Status)
1✔
167
        if err := r.Status().Update(ctx, instance); err != nil {
1✔
NEW
168
                reqLogger.Error(err, "Failed to update CR status")
×
NEW
169
                return err
×
NEW
170
        }
×
171
        cr.Status.State = instance.Status.State
1✔
172
        cr.Status.Namespace = instance.Status.Namespace
1✔
173
        return nil
1✔
174
}
175

176
// enqueueAllGPUClusterConfigs enqueues every instance so each is reconciled when any
177
// instance or owned resource changes.
178
func (r *GPUClusterConfigReconciler) enqueueAllGPUClusterConfigs(ctx context.Context, _ *nvidiav1alpha1.GPUClusterConfig) []reconcile.Request {
1✔
179
        logger := log.FromContext(ctx)
1✔
180
        list := &nvidiav1alpha1.GPUClusterConfigList{}
1✔
181

1✔
182
        if err := r.List(ctx, list); err != nil {
1✔
NEW
183
                logger.Error(err, "Unable to list GPUClusterConfig resources")
×
NEW
184
                return []reconcile.Request{}
×
NEW
185
        }
×
186

187
        reconcileRequests := make([]reconcile.Request, 0, len(list.Items))
1✔
188
        for _, config := range list.Items {
2✔
189
                reconcileRequests = append(reconcileRequests,
1✔
190
                        reconcile.Request{
1✔
191
                                NamespacedName: types.NamespacedName{
1✔
192
                                        Name: config.GetName(),
1✔
193
                                },
1✔
194
                        })
1✔
195
        }
1✔
196

197
        return reconcileRequests
1✔
198
}
199

NEW
200
func (r *GPUClusterConfigReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
×
NEW
201
        // The state manager renders the DRA driver operand for the GPUClusterConfig.
×
NEW
202
        stateManager, err := state.NewManager(
×
NEW
203
                nvidiav1alpha1.GPUClusterConfigCRDName,
×
NEW
204
                r.Namespace,
×
NEW
205
                mgr.GetClient(),
×
NEW
206
                mgr.GetScheme())
×
NEW
207
        if err != nil {
×
NEW
208
                return fmt.Errorf("error creating state manager: %v", err)
×
NEW
209
        }
×
NEW
210
        r.stateManager = stateManager
×
NEW
211

×
NEW
212
        r.conditionUpdater = conditions.NewGPUClusterConfigUpdater(mgr.GetClient())
×
NEW
213

×
NEW
214
        c, err := controller.New("gpu-cluster-config-controller", mgr, controller.Options{
×
NEW
215
                Reconciler:              r,
×
NEW
216
                MaxConcurrentReconciles: 1,
×
NEW
217
                RateLimiter:             workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](minDelayCR, maxDelayCR),
×
NEW
218
        })
×
NEW
219
        if err != nil {
×
NEW
220
                return err
×
NEW
221
        }
×
222

NEW
223
        err = c.Watch(source.Kind(
×
NEW
224
                mgr.GetCache(),
×
NEW
225
                &nvidiav1alpha1.GPUClusterConfig{},
×
NEW
226
                handler.TypedEnqueueRequestsFromMapFunc(r.enqueueAllGPUClusterConfigs),
×
NEW
227
                predicate.TypedGenerationChangedPredicate[*nvidiav1alpha1.GPUClusterConfig]{},
×
NEW
228
        ),
×
NEW
229
        )
×
NEW
230
        if err != nil {
×
NEW
231
                return err
×
NEW
232
        }
×
233

234
        // Watch the secondary resources each state manager owns.
NEW
235
        watchSources := stateManager.GetWatchSources(mgr)
×
NEW
236
        for _, watchSource := range watchSources {
×
NEW
237
                err = c.Watch(
×
NEW
238
                        watchSource,
×
NEW
239
                )
×
NEW
240
                if err != nil {
×
NEW
241
                        return fmt.Errorf("error setting up Watch for source type %v: %w", watchSource, err)
×
NEW
242
                }
×
243
        }
244

NEW
245
        return nil
×
246
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc