• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

NVIDIA / gpu-operator / 28617391944

02 Jul 2026 07:40PM UTC coverage: 31.944% (+0.7%) from 31.227%
28617391944

Pull #2571

github

karthikvetrivel
Add GPUCluster Helm install with ClusterPolicy/NVIDIADriver coexistence

Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
Pull Request #2571: Add GPUCluster CRD and controller for DRA-based stack

518 of 1383 new or added lines in 28 files covered. (37.45%)

19 existing lines in 4 files now uncovered.

4630 of 14494 relevant lines covered (31.94%)

0.36 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

43.75
/controllers/gpucluster_controller.go
1
/*
2
Copyright 2025.
3

4
Licensed under the Apache License, Version 2.0 (the "License");
5
you may not use this file except in compliance with the License.
6
You may obtain a copy of the License at
7

8
    http://www.apache.org/licenses/LICENSE-2.0
9

10
Unless required by applicable law or agreed to in writing, software
11
distributed under the License is distributed on an "AS IS" BASIS,
12
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
See the License for the specific language governing permissions and
14
limitations under the License.
15
*/
16

17
package controllers
18

19
import (
20
        "context"
21
        "fmt"
22
        "time"
23

24
        apierrors "k8s.io/apimachinery/pkg/api/errors"
25
        "k8s.io/apimachinery/pkg/runtime"
26
        "k8s.io/apimachinery/pkg/types"
27
        "k8s.io/client-go/util/workqueue"
28
        ctrl "sigs.k8s.io/controller-runtime"
29
        "sigs.k8s.io/controller-runtime/pkg/client"
30
        "sigs.k8s.io/controller-runtime/pkg/controller"
31
        "sigs.k8s.io/controller-runtime/pkg/handler"
32
        "sigs.k8s.io/controller-runtime/pkg/log"
33
        "sigs.k8s.io/controller-runtime/pkg/predicate"
34
        "sigs.k8s.io/controller-runtime/pkg/reconcile"
35
        "sigs.k8s.io/controller-runtime/pkg/source"
36

37
        nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
38
        "github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
39
        "github.com/NVIDIA/gpu-operator/internal/conditions"
40
        "github.com/NVIDIA/gpu-operator/internal/consts"
41
        "github.com/NVIDIA/gpu-operator/internal/state"
42
)
43

44
// GPUClusterReconciler reconciles a GPUCluster object
45
type GPUClusterReconciler struct {
46
        client.Client
47
        Scheme      *runtime.Scheme
48
        ClusterInfo clusterinfo.Interface
49
        Namespace   string
50

51
        stateManager     state.Manager
52
        conditionUpdater conditions.Updater
53

54
        // singleton is the GPUCluster that owns reconciliation; the first instance to
55
        // reconcile claims it (first-wins), mirroring ClusterPolicy.
56
        singleton *nvidiav1alpha1.GPUCluster
57
}
58

59
//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusters,verbs=get;list;watch;create;update;patch;delete
60
//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusters/status,verbs=get;update;patch
61
//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusters/finalizers,verbs=update
62
//+kubebuilder:rbac:groups=nvidia.com,resources=clusterpolicies,verbs=get;list;watch
63
//+kubebuilder:rbac:groups="",resources=namespaces,verbs=get;update;patch
64
//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaimtemplates,verbs=get;list;watch;create;update;delete
65

66
func (r *GPUClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
1✔
67
        logger := log.FromContext(ctx)
1✔
68
        logger.V(consts.LogLevelInfo).Info("Reconciling GPUCluster")
1✔
69

1✔
70
        instance := &nvidiav1alpha1.GPUCluster{}
1✔
71
        if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
2✔
72
                if apierrors.IsNotFound(err) {
2✔
73
                        // Deleted; owned objects are garbage-collected, so there is nothing to clean up.
1✔
74
                        return ctrl.Result{}, nil
1✔
75
                }
1✔
76
                // instance was not populated by the failed Get, so there is no object to
77
                // update status on; just surface the error for requeue.
NEW
78
                logger.Error(err, "error getting GPUCluster object")
×
NEW
79
                return ctrl.Result{}, fmt.Errorf("error getting GPUCluster object: %w", err)
×
80
        }
81

82
        // GPUCluster (DRA plane) may coexist with a ClusterPolicy (device-plugin
83
        // plane): every operand DaemonSet of both planes gates on the per-node
84
        // nvidia.com/gpu-operator.mode label, so each node is served by exactly one plane.
85

86
        // Singleton, first-wins (mirroring ClusterPolicy): the first instance to reconcile
87
        // claims ownership; any other instance is marked Ignored and skipped. The owner is
88
        // held in memory, so the choice resets on operator restart.
89
        if r.singleton != nil && r.singleton.Name != instance.Name {
2✔
90
                logger.V(consts.LogLevelWarning).Info("Multiple GPUCluster instances found, ignoring this one",
1✔
91
                        "name", instance.Name, "owner", r.singleton.Name)
1✔
92
                if err := r.updateCrStatus(ctx, instance, nvidiav1alpha1.Ignored); err != nil {
1✔
NEW
93
                        return ctrl.Result{}, err
×
NEW
94
                }
×
95
                return ctrl.Result{}, nil
1✔
96
        }
97
        r.singleton = instance
1✔
98

1✔
99
        infoCatalog := state.NewInfoCatalog()
1✔
100
        infoCatalog.Add(state.InfoTypeClusterInfo, r.ClusterInfo)
1✔
101

1✔
102
        managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)
1✔
103

1✔
104
        if err := r.updateCrStatus(ctx, instance, nvidiav1alpha1.State(managerStatus.Status)); err != nil {
1✔
NEW
105
                return ctrl.Result{}, err
×
NEW
106
        }
×
107

108
        if managerStatus.Status != state.SyncStateReady {
1✔
NEW
109
                logger.Info("GPUCluster instance is not ready")
×
NEW
110
                for _, result := range managerStatus.StatesStatus {
×
NEW
111
                        if result.Status != state.SyncStateReady && result.ErrInfo != nil {
×
NEW
112
                                if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Error syncing state %s: %v", result.StateName, result.ErrInfo)); condErr != nil {
×
NEW
113
                                        logger.Error(condErr, "failed to set condition")
×
NEW
114
                                }
×
NEW
115
                                return ctrl.Result{RequeueAfter: time.Second * 5}, nil
×
116
                        }
117
                }
118
                // no state reported an error, so we are waiting on operand pods
NEW
119
                if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.OperandNotReady, "Waiting for operand pods to be ready"); condErr != nil {
×
NEW
120
                        logger.Error(condErr, "failed to set condition")
×
NEW
121
                }
×
NEW
122
                return ctrl.Result{RequeueAfter: time.Second * 5}, nil
×
123
        }
124

125
        if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, "All resources have been successfully reconciled"); condErr != nil {
1✔
NEW
126
                logger.Error(condErr, "failed to set condition")
×
NEW
127
                return ctrl.Result{}, condErr
×
NEW
128
        }
×
129
        // Resync periodically so out-of-band changes (a deleted DeviceClass/VAP, or a
130
        // newly-created ClusterPolicy) are detected and reconciled even while ready;
131
        // only DaemonSets are watched, and the ready path is otherwise event-driven.
132
        return ctrl.Result{RequeueAfter: time.Minute}, nil
1✔
133
}
134

135
// updateCrStatus writes desired to the CR's status, skipping the write when it is already current.
136
func (r *GPUClusterReconciler) updateCrStatus(ctx context.Context, cr *nvidiav1alpha1.GPUCluster, desired nvidiav1alpha1.State) error {
1✔
137
        reqLogger := log.FromContext(ctx)
1✔
138

1✔
139
        // Refetch to avoid a resourceVersion conflict.
1✔
140
        instance := &nvidiav1alpha1.GPUCluster{}
1✔
141
        if err := r.Get(ctx, types.NamespacedName{Name: cr.Name}, instance); err != nil {
1✔
NEW
142
                reqLogger.Error(err, "Failed to get GPUCluster instance for status update")
×
NEW
143
                return err
×
NEW
144
        }
×
145

146
        if instance.Status.State == desired && instance.Status.Namespace == r.Namespace {
1✔
NEW
147
                return nil
×
NEW
148
        }
×
149
        instance.Status.State = desired
1✔
150
        instance.Status.Namespace = r.Namespace
1✔
151

1✔
152
        reqLogger.V(consts.LogLevelInfo).Info("Updating CR Status", "Status", instance.Status)
1✔
153
        if err := r.Status().Update(ctx, instance); err != nil {
1✔
NEW
154
                reqLogger.Error(err, "Failed to update CR status")
×
NEW
155
                return err
×
NEW
156
        }
×
157
        cr.Status.State = instance.Status.State
1✔
158
        cr.Status.Namespace = instance.Status.Namespace
1✔
159
        return nil
1✔
160
}
161

162
// enqueueAllGPUClusters enqueues every instance so each is reconciled when any
163
// instance or owned resource changes.
164
func (r *GPUClusterReconciler) enqueueAllGPUClusters(ctx context.Context, _ *nvidiav1alpha1.GPUCluster) []reconcile.Request {
1✔
165
        logger := log.FromContext(ctx)
1✔
166
        list := &nvidiav1alpha1.GPUClusterList{}
1✔
167

1✔
168
        if err := r.List(ctx, list); err != nil {
1✔
NEW
169
                logger.Error(err, "Unable to list GPUCluster resources")
×
NEW
170
                return []reconcile.Request{}
×
NEW
171
        }
×
172

173
        reconcileRequests := make([]reconcile.Request, 0, len(list.Items))
1✔
174
        for _, config := range list.Items {
2✔
175
                reconcileRequests = append(reconcileRequests,
1✔
176
                        reconcile.Request{
1✔
177
                                NamespacedName: types.NamespacedName{
1✔
178
                                        Name: config.GetName(),
1✔
179
                                },
1✔
180
                        })
1✔
181
        }
1✔
182

183
        return reconcileRequests
1✔
184
}
185

NEW
186
func (r *GPUClusterReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
×
NEW
187
        // The state manager renders the DRA driver operand for the GPUCluster.
×
NEW
188
        stateManager, err := state.NewManager(
×
NEW
189
                nvidiav1alpha1.GPUClusterCRDName,
×
NEW
190
                r.Namespace,
×
NEW
191
                mgr.GetClient(),
×
NEW
192
                mgr.GetScheme())
×
NEW
193
        if err != nil {
×
NEW
194
                return fmt.Errorf("error creating state manager: %v", err)
×
NEW
195
        }
×
NEW
196
        r.stateManager = stateManager
×
NEW
197

×
NEW
198
        r.conditionUpdater = conditions.NewGPUClusterUpdater(mgr.GetClient())
×
NEW
199

×
NEW
200
        c, err := controller.New("gpu-cluster-controller", mgr, controller.Options{
×
NEW
201
                Reconciler:              r,
×
NEW
202
                MaxConcurrentReconciles: 1,
×
NEW
203
                RateLimiter:             workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](minDelayCR, maxDelayCR),
×
NEW
204
        })
×
NEW
205
        if err != nil {
×
NEW
206
                return err
×
NEW
207
        }
×
208

NEW
209
        err = c.Watch(source.Kind(
×
NEW
210
                mgr.GetCache(),
×
NEW
211
                &nvidiav1alpha1.GPUCluster{},
×
NEW
212
                handler.TypedEnqueueRequestsFromMapFunc(r.enqueueAllGPUClusters),
×
NEW
213
                predicate.TypedGenerationChangedPredicate[*nvidiav1alpha1.GPUCluster]{},
×
NEW
214
        ),
×
NEW
215
        )
×
NEW
216
        if err != nil {
×
NEW
217
                return err
×
NEW
218
        }
×
219

220
        // Watch the secondary resources each state manager owns.
NEW
221
        watchSources := stateManager.GetWatchSources(mgr)
×
NEW
222
        for _, watchSource := range watchSources {
×
NEW
223
                err = c.Watch(
×
NEW
224
                        watchSource,
×
NEW
225
                )
×
NEW
226
                if err != nil {
×
NEW
227
                        return fmt.Errorf("error setting up Watch for source type %v: %w", watchSource, err)
×
NEW
228
                }
×
229
        }
230

NEW
231
        return nil
×
232
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc