• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

NVIDIA / gpu-operator / 26959561399

04 Jun 2026 02:51PM UTC coverage: 28.844%. Remained the same
26959561399

push

github

karthikvetrivel
Add GPUClusterConfig controller with singleton status handling

Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>

56 of 220 new or added lines in 4 files covered. (25.45%)

8 existing lines in 2 files now uncovered.

3754 of 13015 relevant lines covered (28.84%)

0.33 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

40.29
/controllers/gpuclusterconfig_controller.go
1
/*
2
Copyright 2025.
3

4
Licensed under the Apache License, Version 2.0 (the "License");
5
you may not use this file except in compliance with the License.
6
You may obtain a copy of the License at
7

8
    http://www.apache.org/licenses/LICENSE-2.0
9

10
Unless required by applicable law or agreed to in writing, software
11
distributed under the License is distributed on an "AS IS" BASIS,
12
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
See the License for the specific language governing permissions and
14
limitations under the License.
15
*/
16

17
package controllers
18

19
import (
20
        "context"
21
        "fmt"
22
        "time"
23

24
        apierrors "k8s.io/apimachinery/pkg/api/errors"
25
        "k8s.io/apimachinery/pkg/runtime"
26
        "k8s.io/apimachinery/pkg/types"
27
        "k8s.io/client-go/util/workqueue"
28
        ctrl "sigs.k8s.io/controller-runtime"
29
        "sigs.k8s.io/controller-runtime/pkg/client"
30
        "sigs.k8s.io/controller-runtime/pkg/controller"
31
        "sigs.k8s.io/controller-runtime/pkg/handler"
32
        "sigs.k8s.io/controller-runtime/pkg/log"
33
        "sigs.k8s.io/controller-runtime/pkg/predicate"
34
        "sigs.k8s.io/controller-runtime/pkg/reconcile"
35
        "sigs.k8s.io/controller-runtime/pkg/source"
36

37
        nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
38
        "github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
39
        "github.com/NVIDIA/gpu-operator/internal/conditions"
40
        "github.com/NVIDIA/gpu-operator/internal/consts"
41
        "github.com/NVIDIA/gpu-operator/internal/state"
42
)
43

44
// GPUClusterConfigReconciler reconciles a GPUClusterConfig object
45
type GPUClusterConfigReconciler struct {
46
        client.Client
47
        Scheme      *runtime.Scheme
48
        ClusterInfo clusterinfo.Interface
49
        Namespace   string
50

51
        stateManager     state.Manager
52
        conditionUpdater conditions.Updater
53

54
        // singleton is the GPUClusterConfig that owns reconciliation; the first instance to
55
        // reconcile claims it (first-wins), mirroring ClusterPolicy.
56
        singleton *nvidiav1alpha1.GPUClusterConfig
57
}
58

59
//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusterconfigs,verbs=get;list;watch;create;update;patch;delete
60
//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusterconfigs/status,verbs=get;update;patch
61
//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusterconfigs/finalizers,verbs=update
62

63
func (r *GPUClusterConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
1✔
64
        logger := log.FromContext(ctx)
1✔
65
        logger.V(consts.LogLevelInfo).Info("Reconciling GPUClusterConfig")
1✔
66

1✔
67
        instance := &nvidiav1alpha1.GPUClusterConfig{}
1✔
68
        if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
2✔
69
                if apierrors.IsNotFound(err) {
2✔
70
                        // Deleted; owned objects are garbage-collected, so there is nothing to clean up.
1✔
71
                        return reconcile.Result{}, nil
1✔
72
                }
1✔
NEW
73
                wrappedErr := fmt.Errorf("error getting GPUClusterConfig object: %w", err)
×
NEW
74
                logger.Error(err, "error getting GPUClusterConfig object")
×
NEW
75
                instance.Status.State = nvidiav1alpha1.NotReady
×
NEW
76
                if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, wrappedErr.Error()); condErr != nil {
×
NEW
77
                        logger.Error(condErr, "failed to set condition")
×
NEW
78
                }
×
NEW
79
                return reconcile.Result{}, wrappedErr
×
80
        }
81

82
        // Singleton, first-wins (mirroring ClusterPolicy): the first instance to reconcile
83
        // claims ownership; any other instance is marked Ignored and skipped. The owner is
84
        // held in memory, so the choice resets on operator restart.
85
        if r.singleton != nil && r.singleton.Name != instance.Name {
2✔
86
                logger.V(consts.LogLevelWarning).Info("Multiple GPUClusterConfig instances found, ignoring this one",
1✔
87
                        "name", instance.Name, "owner", r.singleton.Name)
1✔
88
                if err := r.updateCrStatus(ctx, instance, nvidiav1alpha1.Ignored); err != nil {
1✔
NEW
89
                        return reconcile.Result{}, err
×
NEW
90
                }
×
91
                return reconcile.Result{}, nil
1✔
92
        }
93
        r.singleton = instance
1✔
94

1✔
95
        infoCatalog := state.NewInfoCatalog()
1✔
96
        infoCatalog.Add(state.InfoTypeClusterInfo, r.ClusterInfo)
1✔
97

1✔
98
        managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)
1✔
99

1✔
100
        if err := r.updateCrStatus(ctx, instance, nvidiav1alpha1.State(managerStatus.Status)); err != nil {
1✔
NEW
101
                return ctrl.Result{}, err
×
NEW
102
        }
×
103

104
        if managerStatus.Status != state.SyncStateReady {
1✔
NEW
105
                logger.Info("GPUClusterConfig instance is not ready")
×
NEW
106
                var errorInfo error
×
NEW
107
                for _, result := range managerStatus.StatesStatus {
×
NEW
108
                        if result.Status != state.SyncStateReady && result.ErrInfo != nil {
×
NEW
109
                                errorInfo = result.ErrInfo
×
NEW
110
                                if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Error syncing state %s: %v", result.StateName, errorInfo.Error())); condErr != nil {
×
NEW
111
                                        logger.Error(condErr, "failed to set condition")
×
NEW
112
                                }
×
NEW
113
                                break
×
114
                        }
115
                }
116
                // if no errors are reported from any state, then we are waiting on operand pods
NEW
117
                if errorInfo == nil {
×
NEW
118
                        if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.OperandNotReady, "Waiting for operand pods to be ready"); condErr != nil {
×
NEW
119
                                logger.Error(condErr, "failed to set condition")
×
NEW
120
                        }
×
121
                }
NEW
122
                return reconcile.Result{RequeueAfter: time.Second * 5}, nil
×
123
        }
124

125
        if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, "All resources have been successfully reconciled"); condErr != nil {
1✔
NEW
126
                logger.Error(condErr, "failed to set condition")
×
NEW
127
                return ctrl.Result{}, condErr
×
NEW
128
        }
×
129
        return reconcile.Result{}, nil
1✔
130
}
131

132
// updateCrStatus writes desired to the CR's status, skipping the write when it is already current.
133
func (r *GPUClusterConfigReconciler) updateCrStatus(ctx context.Context, cr *nvidiav1alpha1.GPUClusterConfig, desired nvidiav1alpha1.State) error {
1✔
134
        reqLogger := log.FromContext(ctx)
1✔
135

1✔
136
        // Refetch to avoid a resourceVersion conflict.
1✔
137
        instance := &nvidiav1alpha1.GPUClusterConfig{}
1✔
138
        if err := r.Get(ctx, types.NamespacedName{Name: cr.Name}, instance); err != nil {
1✔
NEW
139
                reqLogger.Error(err, "Failed to get GPUClusterConfig instance for status update")
×
NEW
140
                return err
×
NEW
141
        }
×
142

143
        if instance.Status.State == desired && instance.Status.Namespace == r.Namespace {
1✔
NEW
144
                return nil
×
NEW
145
        }
×
146
        instance.Status.State = desired
1✔
147
        instance.Status.Namespace = r.Namespace
1✔
148

1✔
149
        reqLogger.V(consts.LogLevelInfo).Info("Updating CR Status", "Status", instance.Status)
1✔
150
        if err := r.Status().Update(ctx, instance); err != nil {
1✔
NEW
151
                reqLogger.Error(err, "Failed to update CR status")
×
NEW
152
                return err
×
NEW
153
        }
×
154
        cr.Status.State = instance.Status.State
1✔
155
        cr.Status.Namespace = instance.Status.Namespace
1✔
156
        return nil
1✔
157
}
158

159
// enqueueAllGPUClusterConfigs enqueues every instance so each is reconciled when any
160
// instance or owned resource changes.
161
func (r *GPUClusterConfigReconciler) enqueueAllGPUClusterConfigs(ctx context.Context) []reconcile.Request {
1✔
162
        logger := log.FromContext(ctx)
1✔
163
        list := &nvidiav1alpha1.GPUClusterConfigList{}
1✔
164

1✔
165
        if err := r.List(ctx, list); err != nil {
1✔
NEW
166
                logger.Error(err, "Unable to list GPUClusterConfig resources")
×
NEW
167
                return []reconcile.Request{}
×
NEW
168
        }
×
169

170
        reconcileRequests := make([]reconcile.Request, 0, len(list.Items))
1✔
171
        for _, config := range list.Items {
2✔
172
                reconcileRequests = append(reconcileRequests,
1✔
173
                        reconcile.Request{
1✔
174
                                NamespacedName: types.NamespacedName{
1✔
175
                                        Name: config.GetName(),
1✔
176
                                },
1✔
177
                        })
1✔
178
        }
1✔
179

180
        return reconcileRequests
1✔
181
}
182

NEW
183
func (r *GPUClusterConfigReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
×
NEW
184
        // No operands are rendered yet (empty state set).
×
NEW
185
        stateManager, err := state.NewManager(
×
NEW
186
                nvidiav1alpha1.GPUClusterConfigCRDName,
×
NEW
187
                r.Namespace,
×
NEW
188
                mgr.GetClient(),
×
NEW
189
                mgr.GetScheme())
×
NEW
190
        if err != nil {
×
NEW
191
                return fmt.Errorf("error creating state manager: %v", err)
×
NEW
192
        }
×
NEW
193
        r.stateManager = stateManager
×
NEW
194

×
NEW
195
        r.conditionUpdater = conditions.NewGPUClusterConfigUpdater(mgr.GetClient())
×
NEW
196

×
NEW
197
        c, err := controller.New("gpu-cluster-config-controller", mgr, controller.Options{
×
NEW
198
                Reconciler:              r,
×
NEW
199
                MaxConcurrentReconciles: 1,
×
NEW
200
                RateLimiter:             workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](minDelayCR, maxDelayCR),
×
NEW
201
        })
×
NEW
202
        if err != nil {
×
NEW
203
                return err
×
NEW
204
        }
×
205

NEW
206
        gpuClusterConfigMapFn := func(ctx context.Context, _ *nvidiav1alpha1.GPUClusterConfig) []reconcile.Request {
×
NEW
207
                return r.enqueueAllGPUClusterConfigs(ctx)
×
NEW
208
        }
×
209

NEW
210
        err = c.Watch(source.Kind(
×
NEW
211
                mgr.GetCache(),
×
NEW
212
                &nvidiav1alpha1.GPUClusterConfig{},
×
NEW
213
                handler.TypedEnqueueRequestsFromMapFunc(gpuClusterConfigMapFn),
×
NEW
214
                predicate.TypedGenerationChangedPredicate[*nvidiav1alpha1.GPUClusterConfig]{},
×
NEW
215
        ),
×
NEW
216
        )
×
NEW
217
        if err != nil {
×
NEW
218
                return err
×
NEW
219
        }
×
220

221
        // Watch the secondary resources each state manager owns.
NEW
222
        watchSources := stateManager.GetWatchSources(mgr)
×
NEW
223
        for _, watchSource := range watchSources {
×
NEW
224
                err = c.Watch(
×
NEW
225
                        watchSource,
×
NEW
226
                )
×
NEW
227
                if err != nil {
×
NEW
228
                        return fmt.Errorf("error setting up Watch for source type %v: %w", watchSource, err)
×
NEW
229
                }
×
230
        }
231

NEW
232
        return nil
×
233
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc