• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

k8snetworkplumbingwg / sriov-network-operator / 19855289298

02 Dec 2025 10:24AM UTC coverage: 62.126% (-0.02%) from 62.149%
19855289298

Pull #967

github

web-flow
Merge e77928524 into d34e85b1c
Pull Request #967: Add support for network interface alternative names

70 of 91 new or added lines in 7 files covered. (76.92%)

21 existing lines in 6 files now uncovered.

8820 of 14197 relevant lines covered (62.13%)

0.69 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

44.99
/pkg/daemon/daemon.go
1
package daemon
2

3
import (
4
        "context"
5
        "fmt"
6
        "time"
7

8
        corev1 "k8s.io/api/core/v1"
9
        "k8s.io/apimachinery/pkg/api/errors"
10
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
11
        "k8s.io/apimachinery/pkg/util/wait"
12
        ctrl "sigs.k8s.io/controller-runtime"
13
        "sigs.k8s.io/controller-runtime/pkg/client"
14
        "sigs.k8s.io/controller-runtime/pkg/controller"
15
        "sigs.k8s.io/controller-runtime/pkg/log"
16
        "sigs.k8s.io/controller-runtime/pkg/predicate"
17

18
        sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
19
        "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
20
        "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
21
        "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
22
        hosttypes "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/host/types"
23
        "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platform"
24
        plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins"
25
        "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils"
26
        "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
27
)
28

29
// NodeReconciler struct holds various components necessary for reconciling an SR-IOV node.
// It includes a Kubernetes client, SR-IOV client, and other utility interfaces.
// The struct is designed to manage the lifecycle of an SR-IOV devices on a given node.
type NodeReconciler struct {
	// client reads/writes SriovNetworkNodeState objects (and device-plugin pods)
	// via the API server.
	client client.Client

	// hostHelpers wraps host-level operations: udev rules, systemd files,
	// kernel modules, checkpoint file, chroot and command execution.
	hostHelpers helper.HostHelpersInterface
	// platformInterface provides platform/hypervisor discovery (initialized in Init).
	platformInterface platform.Interface

	// eventRecorder emits Kubernetes events (e.g. before triggering a node reboot).
	eventRecorder *EventRecorder

	// featureGate exposes feature-gate state to the reconciler.
	featureGate featuregate.FeatureGate

	// additionalPlugins are vendor plugins consulted for drain/reboot requirements
	// and applied after drain; mainPlugin is applied only when no reboot is needed
	// and not running in systemd mode (see apply()).
	additionalPlugins []plugin.VendorPlugin
	mainPlugin        plugin.VendorPlugin

	// lastAppliedGeneration is the nodeState generation last successfully applied;
	// Reconcile uses it to decide between a host-drift refresh and a full apply.
	lastAppliedGeneration int64
}
47

48
// New creates a new instance of NodeReconciler.
49
func New(
50
        client client.Client,
51
        hostHelpers helper.HostHelpersInterface,
52
        platformInterface platform.Interface,
53
        er *EventRecorder,
54
        featureGates featuregate.FeatureGate,
55
) *NodeReconciler {
1✔
56
        return &NodeReconciler{
1✔
57
                client:            client,
1✔
58
                hostHelpers:       hostHelpers,
1✔
59
                platformInterface: platformInterface,
1✔
60

1✔
61
                lastAppliedGeneration: 0,
1✔
62
                eventRecorder:         er,
1✔
63
                featureGate:           featureGates,
1✔
64
        }
1✔
65
}
1✔
66

67
// Init initializes the Sriov Network Operator daemon.
// It enables kernel modules, prepare udev rules and load the host network state
//
// Steps (order matters):
//  1. In daemon (non-systemd) mode: check RDMA, try enabling tun/vhost-net
//     modules, and remove leftover systemd sriov files (all best-effort).
//  2. Write udev rules (NetworkManager exclusion, VF representor rename) —
//     failures are logged but not fatal.
//  3. Initialize the platform/hypervisor interface (fatal on error).
//  4. Read the current host network state into a fresh nodeState status.
//  5. Load vendor plugins based on that state, excluding disabledPlugins.
//  6. Persist the initial state as a checkpoint file on the host.
//
// Returns the checkpoint-write error (or nil) once steps 3-5 succeed.
func (dn *NodeReconciler) Init(disabledPlugins []string) error {
	funcLog := log.Log.WithName("Init")

	if !vars.UsingSystemdMode {
		funcLog.V(0).Info("daemon running in daemon mode")
		// best-effort: RDMA check failure is logged, not returned
		_, err := dn.hostHelpers.CheckRDMAEnabled()
		if err != nil {
			funcLog.Error(err, "warning, failed to check RDMA state")
		}
		dn.hostHelpers.TryEnableTun()
		dn.hostHelpers.TryEnableVhostNet()
		// remove systemd-mode artifacts since we are running in daemon mode
		err = dn.hostHelpers.CleanSriovFilesFromHost(vars.ClusterType == consts.ClusterTypeOpenshift)
		if err != nil {
			funcLog.Error(err, "failed to remove all the systemd sriov files")
		}
	} else {
		funcLog.V(0).Info("Run(): daemon running in systemd mode")
	}

	// udev rule preparation is best-effort: errors logged, init continues
	if err := dn.hostHelpers.PrepareNMUdevRule(); err != nil {
		funcLog.Error(err, "failed to prepare udev files to disable network manager on requested VFs")
	}
	if err := dn.hostHelpers.PrepareVFRepUdevRule(); err != nil {
		funcLog.Error(err, "failed to prepare udev files to rename VF representors for requested VFs")
	}

	// init hypervisor info
	err := dn.platformInterface.Init()
	if err != nil {
		return err
	}

	// get interfaces
	ns := &sriovnetworkv1.SriovNetworkNodeState{}
	err = dn.updateStatusFromHost(ns)
	if err != nil {
		funcLog.Error(err, "failed to get host network status on init")
		return err
	}

	// init vendor plugins
	err = dn.loadPlugins(ns, disabledPlugins)
	if err != nil {
		funcLog.Error(err, "failed to load vendor plugins")
		return err
	}

	// save init state; the write error (possibly nil) is the function result
	err = dn.hostHelpers.WriteCheckpointFile(ns)
	if err != nil {
		funcLog.Error(err, "failed to write checkpoint file on host")
	}
	return err
}
123

124
// Reconcile Reconciles the nodeState object by performing the following steps:
// 1. Retrieves the latest NodeState from the API server.
// 2. Checks if the object has the required drain controller annotations for the current generation.
// 3. Updates the nodeState Status object with the existing network state (interfaces, bridges, and RDMA status).
// 4. If running in systemd mode, checks the sriov result from the config-daemon that runs in systemd.
// 5. Compares the latest generation with the last applied generation to determine if a refresh on NICs is needed.
// 6. Checks for drift between the host state and the nodeState status.
// 7. Updates the sync state of the nodeState object as per the current requirements.
// 8. Determines if a drain is required based on the current state of the nodeState.
// 9. Handles the drain if necessary, ensuring that it does not conflict with other drain requests.
// 10. Applies the changes to the nodeState if there are no issues and updates the sync status accordingly.
// 11. If a reboot is required after applying the changes, returns a result to trigger a reboot.
//
// Returns a Result indicating whether or not the controller should requeue the request for further processing.
func (dn *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	reqLogger := log.FromContext(ctx).WithName("Reconcile")
	// Get the latest NodeState
	desiredNodeState := &sriovnetworkv1.SriovNetworkNodeState{}
	err := dn.client.Get(ctx, client.ObjectKey{Namespace: req.Namespace, Name: req.Name}, desiredNodeState)
	if err != nil {
		// a missing object is not an error: nothing to reconcile
		if errors.IsNotFound(err) {
			reqLogger.Info("NodeState doesn't exist")
			return ctrl.Result{}, nil
		}
		reqLogger.Error(err, "Failed to fetch node state", "name", vars.NodeName)
		return ctrl.Result{}, err
	}

	// Check the object has the drain controller annotations
	// if not just wait for the drain controller to add them before we start taking care of the nodeState
	if !utils.ObjectHasAnnotationKey(desiredNodeState, consts.NodeStateDrainAnnotationCurrent) ||
		!utils.ObjectHasAnnotationKey(desiredNodeState, consts.NodeStateDrainAnnotation) {
		reqLogger.V(2).Info("NodeState doesn't have the current drain annotation")
		return ctrl.Result{}, nil
	}

	latest := desiredNodeState.GetGeneration()
	// keep a pre-refresh copy so shouldUpdateStatus can diff old vs. new status
	current := desiredNodeState.DeepCopy()
	reqLogger.V(0).Info("new generation", "generation", latest)

	// Update the nodeState Status object with the existing network state (interfaces bridges and rdma status)
	err = dn.updateStatusFromHost(desiredNodeState)
	if err != nil {
		reqLogger.Error(err, "failed to get host network status")
		return ctrl.Result{}, err
	}

	// if we are running in systemd mode we want to get the sriov result from the config-daemon that runs in systemd
	sriovResult, sriovResultExists, err := dn.checkSystemdStatus()
	//TODO: in the case we need to think what to do if we try to apply again or not
	if err != nil {
		reqLogger.Error(err, "failed to check systemd status unexpected error")
		err = dn.updateSyncState(ctx, desiredNodeState, consts.SyncStatusFailed, "unexpected error")
		if err != nil {
			reqLogger.Error(err, "failed to update nodeState status")
			return ctrl.Result{}, err
		}

		// status was updated to Failed; do not requeue with an error
		return ctrl.Result{}, nil
	}

	// if we are on the latest generation make a refresh on the nics
	if dn.lastAppliedGeneration == latest {
		isDrifted, err := dn.checkHostStateDrift(ctx, desiredNodeState)
		if err != nil {
			reqLogger.Error(err, "failed to refresh host state")
			return ctrl.Result{}, err
		}

		// if there are no host state drift changes, and we are on the latest applied policy
		// we check if we need to publish a new nodeState status if not we requeue
		if !isDrifted {
			shouldUpdate := dn.shouldUpdateStatus(current, desiredNodeState)
			if shouldUpdate {
				reqLogger.Info("updating nodeState with new host status")
				err = dn.updateSyncState(ctx, desiredNodeState, desiredNodeState.Status.SyncStatus, desiredNodeState.Status.LastSyncError)
				if err != nil {
					reqLogger.Error(err, "failed to update nodeState new host status")
					return ctrl.Result{}, err
				}
			}

			return ctrl.Result{RequeueAfter: consts.DaemonRequeueTime}, nil
		}
	}

	// set sync state to inProgress, but we don't clear the failed status
	err = dn.updateSyncState(ctx, desiredNodeState, consts.SyncStatusInProgress, desiredNodeState.Status.LastSyncError)
	if err != nil {
		reqLogger.Error(err, "failed to update sync status to inProgress")
		return ctrl.Result{}, err
	}

	// note: returns (reqReboot, reqDrain) in that order
	reqReboot, reqDrain, err := dn.checkOnNodeStateChange(desiredNodeState)
	if err != nil {
		return ctrl.Result{}, err
	}

	if vars.UsingSystemdMode {
		// When running using systemd check if the applied configuration is the latest one
		// or there is a new config we need to apply
		// When using systemd configuration we write the file
		systemdConfModified, err := dn.writeSystemdConfigFile(desiredNodeState)
		if err != nil {
			reqLogger.Error(err, "failed to write systemd config file")
			return ctrl.Result{}, err
		}
		// a missing systemd result file also forces a drain (and hence a reboot)
		reqDrain = reqDrain || systemdConfModified || !sriovResultExists
		// require reboot if drain needed for systemd mode
		reqReboot = reqReboot || reqDrain
	}

	reqLogger.V(0).Info("aggregated daemon node state requirement",
		"drain-required", reqDrain, "reboot-required", reqReboot, "disable-drain", vars.DisableDrain)

	// handle drain only if the plugins request drain, or we are already in a draining request state
	if reqDrain ||
		!utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainIdle) {
		drainInProcess, err := dn.handleDrain(ctx, desiredNodeState, reqReboot)
		if err != nil {
			reqLogger.Error(err, "failed to handle drain")
			return ctrl.Result{}, err
		}

		// TODO: remove this after we stop using the node annotation
		// drain is still in progress we will still requeue the request in case there is an un-expect state in the draining
		// this will allow the daemon to try again.
		if drainInProcess {
			reqLogger.Info("node drain still in progress, requeue")
			return ctrl.Result{RequeueAfter: consts.DaemonRequeueTime}, nil
		}
	}

	// if we finish the drain we should run apply here
	if dn.isDrainCompleted(reqDrain, desiredNodeState) {
		return dn.apply(ctx, desiredNodeState, reqReboot, sriovResult)
	}

	return ctrl.Result{}, nil
}
264

265
// checkOnNodeStateChange checks the state change required for the node based on the desired SriovNetworkNodeState.
266
// The function iterates over all loaded plugins and calls their OnNodeStateChange method with the desired state.
267
// It returns two boolean values indicating whether a reboot or drain operation is required.
268
func (dn *NodeReconciler) checkOnNodeStateChange(desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, bool, error) {
1✔
269
        funcLog := log.Log.WithName("checkOnNodeStateChange")
1✔
270
        // Check the main plugin for changes
1✔
271
        reqDrain, reqReboot, err := dn.mainPlugin.OnNodeStateChange(desiredNodeState)
1✔
272
        if err != nil {
1✔
273
                funcLog.Error(err, "OnNodeStateChange plugin error", "mainPluginName", dn.mainPlugin.Name)
×
274
                return false, false, err
×
275
        }
×
276
        funcLog.V(0).Info("OnNodeStateChange result",
1✔
277
                "main plugin name", dn.mainPlugin.Name(),
1✔
278
                "drain-required", reqDrain,
1✔
279
                "reboot-required", reqReboot)
1✔
280

1✔
281
        // check if any of the plugins required to drain or reboot the node
1✔
282
        for _, p := range dn.additionalPlugins {
1✔
283
                d, r, err := p.OnNodeStateChange(desiredNodeState)
×
284
                if err != nil {
×
285
                        funcLog.Error(err, "OnNodeStateChange plugin error", "pluginName", p.Name())
×
286
                        return false, false, err
×
287
                }
×
288
                funcLog.V(0).Info("OnNodeStateChange result",
×
289
                        "pluginName", p.Name(),
×
290
                        "drain-required", d,
×
291
                        "reboot-required", r)
×
292
                reqDrain = reqDrain || d
×
293
                reqReboot = reqReboot || r
×
294
        }
295

296
        return reqReboot, reqDrain, nil
1✔
297
}
298

299
// checkSystemdStatus Checks the status of systemd services on the host node.
300
// return the sriovResult struct a boolean if the result file exist on the node
301
func (dn *NodeReconciler) checkSystemdStatus() (*hosttypes.SriovResult, bool, error) {
1✔
302
        if !vars.UsingSystemdMode {
2✔
303
                return nil, false, nil
1✔
304
        }
1✔
305

306
        funcLog := log.Log.WithName("checkSystemdStatus")
×
307
        serviceEnabled, err := dn.hostHelpers.IsServiceEnabled(consts.SriovServicePath)
×
308
        if err != nil {
×
309
                funcLog.Error(err, "failed to check if sriov-config service exist on host")
×
310
                return nil, false, err
×
311
        }
×
312
        postNetworkServiceEnabled, err := dn.hostHelpers.IsServiceEnabled(consts.SriovPostNetworkServicePath)
×
313
        if err != nil {
×
314
                funcLog.Error(err, "failed to check if sriov-config-post-network service exist on host")
×
315
                return nil, false, err
×
316
        }
×
317

318
        // if the service doesn't exist we should continue to let the k8s plugin to create the service files
319
        // this is only for k8s base environments, for openshift the sriov-operator creates a machine config to will apply
320
        // the system service and reboot the node the config-daemon doesn't need to do anything.
321
        sriovResult := &hosttypes.SriovResult{SyncStatus: consts.SyncStatusFailed,
×
322
                LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+
×
323
                        "sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)}
×
324
        exist := false
×
325

×
326
        // check if the service exist
×
327
        if serviceEnabled && postNetworkServiceEnabled {
×
328
                exist = true
×
329
                sriovResult, err = dn.hostHelpers.ReadSriovResult()
×
330
                if err != nil {
×
331
                        funcLog.Error(err, "failed to load sriov result file from host")
×
332
                        return nil, false, err
×
333
                }
×
334
        }
335
        return sriovResult, exist, nil
×
336
}
337

338
// apply applies the desired state of the node by:
// 1. Applying vendor plugins that have been loaded.
// 2. Depending on whether a reboot is required or if the configuration is being done via systemd, it applies the generic or virtual plugin(s).
// 3. Rebooting the node if necessary and sending an event.
// 4. Restarting the device plugin pod on the node.
// 5. Requesting annotation updates for draining the idle state of the node.
// 6. Synchronizing with the host network status and updating the sync status of the node in the nodeState object.
// 7. Updating the lastAppliedGeneration to the current generation.
func (dn *NodeReconciler) apply(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, reqReboot bool, sriovResult *hosttypes.SriovResult) (ctrl.Result, error) {
	reqLogger := log.FromContext(ctx).WithName("Apply")
	// apply the additional plugins after we are done with drain if needed
	for _, p := range dn.additionalPlugins {
		err := p.Apply()
		if err != nil {
			reqLogger.Error(err, "plugin Apply failed", "plugin-name", p.Name())
			return ctrl.Result{}, err
		}
	}

	// if we don't need to reboot, or we are not doing the configuration in systemd
	// we apply the main plugin
	if !reqReboot && !vars.UsingSystemdMode && dn.mainPlugin != nil {
		err := dn.mainPlugin.Apply()
		if err != nil {
			reqLogger.Error(err, "plugin Apply failed", "plugin-name", dn.mainPlugin.Name())
			return ctrl.Result{}, err
		}
	}

	// reboot path: emit an event, then trigger the reboot and stop here —
	// the remaining steps run on the post-reboot reconcile.
	if reqReboot {
		reqLogger.Info("reboot node")
		dn.eventRecorder.SendEvent(ctx, "RebootNode", "Reboot node has been initiated")
		return ctrl.Result{}, dn.rebootNode()
	}

	// restart the device plugin so it re-discovers the reconfigured VFs
	if err := dn.restartDevicePluginPod(ctx); err != nil {
		reqLogger.Error(err, "failed to restart device plugin on the node")
		return ctrl.Result{}, err
	}

	// mark draining complete (back to idle) on node and nodeState
	err := dn.annotate(ctx, desiredNodeState, consts.DrainIdle)
	if err != nil {
		reqLogger.Error(err, "failed to request annotation update to idle")
		return ctrl.Result{}, err
	}

	reqLogger.Info("sync succeeded")
	// in systemd mode the authoritative status comes from the systemd run's result file
	syncStatus := consts.SyncStatusSucceeded
	lastSyncError := ""
	if vars.UsingSystemdMode {
		syncStatus = sriovResult.SyncStatus
		lastSyncError = sriovResult.LastSyncError
	}

	// Update the nodeState Status object with the existing network interfaces
	err = dn.updateStatusFromHost(desiredNodeState)
	if err != nil {
		reqLogger.Error(err, "failed to get host network status")
		return ctrl.Result{}, err
	}

	err = dn.updateSyncState(ctx, desiredNodeState, syncStatus, lastSyncError)
	if err != nil {
		reqLogger.Error(err, "failed to update sync status")
		return ctrl.Result{}, err
	}

	// update the lastAppliedGeneration
	dn.lastAppliedGeneration = desiredNodeState.Generation
	return ctrl.Result{RequeueAfter: consts.DaemonRequeueTime}, nil
}
409

410
// checkHostStateDrift returns true if the node state drifted from the nodeState policy
411
// Check if there is a change in the host network interfaces that require a reconfiguration by the daemon
412
func (dn *NodeReconciler) checkHostStateDrift(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
1✔
413
        funcLog := log.Log.WithName("checkHostStateDrift()")
1✔
414

1✔
415
        // Skip when SriovNetworkNodeState object has just been created.
1✔
416
        if desiredNodeState.GetGeneration() == 1 && len(desiredNodeState.Spec.Interfaces) == 0 {
1✔
417
                err := dn.hostHelpers.ClearPCIAddressFolder()
×
418
                if err != nil {
×
419
                        funcLog.Error(err, "failed to clear the PCI address configuration")
×
420
                        return false, err
×
421
                }
×
422

423
                funcLog.V(0).Info("interface policy spec not yet set by controller for sriovNetworkNodeState",
×
424
                        "name", desiredNodeState.Name)
×
425
                if desiredNodeState.Status.SyncStatus != consts.SyncStatusSucceeded ||
×
426
                        desiredNodeState.Status.LastSyncError != "" {
×
427
                        err = dn.updateSyncState(ctx, desiredNodeState, consts.SyncStatusSucceeded, "")
×
428
                }
×
429
                return false, err
×
430
        }
431

432
        // Verify changes in the status of the SriovNetworkNodeState CR.
433
        log.Log.V(0).Info("verifying interfaces status change")
1✔
434
        if dn.mainPlugin != nil {
2✔
435
                log.Log.V(2).Info("verifying status change for plugin", "pluginName", dn.mainPlugin.Name())
1✔
436
                changed, err := dn.mainPlugin.CheckStatusChanges(desiredNodeState)
1✔
437
                if err != nil {
1✔
438
                        return false, err
×
439
                }
×
440
                if changed {
1✔
441
                        log.Log.V(0).Info("plugin require change", "pluginName", dn.mainPlugin.Name())
×
442
                        return true, nil
×
443
                }
×
444
        }
445

446
        for _, p := range dn.additionalPlugins {
1✔
447
                // Verify changes in the status of the SriovNetworkNodeState CR.
×
448
                log.Log.V(2).Info("verifying status change for plugin", "pluginName", p.Name())
×
449
                changed, err := p.CheckStatusChanges(desiredNodeState)
×
450
                if err != nil {
×
451
                        return false, err
×
452
                }
×
453
                if changed {
×
454
                        log.Log.V(0).Info("plugin require change", "pluginName", p.Name())
×
455
                        return true, nil
×
456
                }
×
457
        }
458

459
        log.Log.V(0).Info("Interfaces not changed")
1✔
460
        return false, nil
1✔
461
}
462

463
// writeSystemdConfigFile Writes the systemd configuration file for the node
464
// and handles any necessary actions such as removing an existing result file and writing supported NIC IDs.
465
//
466
//        The function first attempts to write the systemd configuration file based on the desired node state.
467
//        If successful, it checks if the configuration file was modified. If so, it removes the existing result file (if present) to ensure that outdated results are not used.
468
//        After writing the configuration file and potentially removing the old one, it writes a file containing supported NIC IDs.
469
func (dn *NodeReconciler) writeSystemdConfigFile(desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
×
470
        funcLog := log.Log.WithName("writeSystemdConfigFile()")
×
471
        funcLog.V(0).Info("writing systemd config file to host")
×
472
        systemdConfModified, err := dn.hostHelpers.WriteConfFile(desiredNodeState)
×
473
        if err != nil {
×
474
                funcLog.Error(err, "failed to write configuration file for systemd mode")
×
475
                return false, err
×
476
        }
×
477
        if systemdConfModified {
×
478
                // remove existing result file to make sure that we will not use outdated result, e.g. in case if
×
479
                // systemd service was not triggered for some reason
×
480
                err = dn.hostHelpers.RemoveSriovResult()
×
481
                if err != nil {
×
482
                        funcLog.Error(err, "failed to remove result file for systemd mode")
×
483
                        return false, err
×
484
                }
×
485
        }
486

487
        err = dn.hostHelpers.WriteSriovSupportedNics()
×
488
        if err != nil {
×
489
                funcLog.Error(err, "failed to write supported nic ids file for systemd mode")
×
490
                return false, err
×
491
        }
×
492

493
        funcLog.V(0).Info("systemd mode WriteConfFile results",
×
494
                "drain-required", systemdConfModified, "reboot-required", systemdConfModified)
×
495
        return systemdConfModified, nil
×
496
}
497

498
// handleDrain: adds the right annotation to the node and nodeState object
499
// returns true if we need to finish the reconcile loop and wait for a new object
500
func (dn *NodeReconciler) handleDrain(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, reqReboot bool) (bool, error) {
1✔
501
        funcLog := log.Log.WithName("handleDrain")
1✔
502
        // done with the drain we can continue with the configuration
1✔
503
        if utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainComplete) {
2✔
504
                funcLog.Info("the node complete the draining")
1✔
505
                return false, nil
1✔
506
        }
1✔
507

508
        // the operator is still draining the node so we reconcile
509
        if utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.Draining) {
1✔
510
                funcLog.Info("the node is still draining")
×
511
                return true, nil
×
512
        }
×
513

514
        // drain is disabled we continue with the configuration
515
        if vars.DisableDrain {
2✔
516
                funcLog.Info("drain is disabled in sriovOperatorConfig")
1✔
517
                return false, nil
1✔
518
        }
1✔
519

520
        // annotate both node and node state with drain or reboot
521
        annotation := consts.DrainRequired
1✔
522
        if reqReboot {
1✔
523
                annotation = consts.RebootRequired
×
524
        }
×
525
        return true, dn.annotate(ctx, desiredNodeState, annotation)
1✔
526
}
527

528
// restartDevicePluginPod restarts the device plugin pod on the specified node.
//
// The function checks if the pod exists, deletes it if found, and waits for it to be deleted successfully.
// Listing is scoped to this node via a field selector on spec.nodeName and the
// "app=sriov-device-plugin" label; ResourceVersion "0" allows a cached list.
// Deletion of each pod is followed by a 3-second poll until the pod is gone
// or ctx is cancelled. Not-found conditions are treated as success throughout.
func (dn *NodeReconciler) restartDevicePluginPod(ctx context.Context) error {
	log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod")
	pods := &corev1.PodList{}
	err := dn.client.List(ctx, pods, &client.ListOptions{
		Namespace: vars.Namespace, Raw: &metav1.ListOptions{
			LabelSelector:   "app=sriov-device-plugin",
			FieldSelector:   "spec.nodeName=" + vars.NodeName,
			ResourceVersion: "0",
		}})
	if err != nil {
		if errors.IsNotFound(err) {
			log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
			return nil
		}
		log.Log.Error(err, "restartDevicePluginPod(): Failed to list device plugin pod, retrying")
		return err
	}

	// nothing to restart
	if len(pods.Items) == 0 {
		log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
		return nil
	}

	// NOTE(review): &pod aliases the loop variable; all uses complete within
	// each iteration so this is safe even pre-Go 1.22 loop-var semantics.
	for _, pod := range pods.Items {
		log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", pod.Name)
		err = dn.client.Delete(ctx, &pod)
		if errors.IsNotFound(err) {
			// already gone — move on to the next pod
			log.Log.Info("restartDevicePluginPod(): pod to delete not found")
			continue
		}
		if err != nil {
			log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
			return err
		}

		// poll every 3s (immediate first check) until the pod object disappears
		tmpPod := &corev1.Pod{}
		if err := wait.PollUntilContextCancel(ctx, 3*time.Second, true, func(ctx context.Context) (bool, error) {
			err := dn.client.Get(ctx, client.ObjectKeyFromObject(&pod), tmpPod)
			if errors.IsNotFound(err) {
				log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
				return true, nil
			}

			// transient Get errors are logged and the poll continues
			if err != nil {
				log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
			} else {
				log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", pod.Name)
			}
			return false, nil
		}); err != nil {
			log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
			return err
		}
	}

	return nil
}
588

589
// rebootNode Reboots the node by executing a systemd-run command
590
func (dn *NodeReconciler) rebootNode() error {
×
591
        funcLog := log.Log.WithName("rebootNode")
×
592
        funcLog.Info("trigger node reboot")
×
593
        exit, err := dn.hostHelpers.Chroot(consts.Host)
×
594
        if err != nil {
×
595
                funcLog.Error(err, "chroot command failed")
×
596
                return err
×
597
        }
×
598
        defer exit()
×
599
        // creates a new transient systemd unit to reboot the system.
×
600
        // We explictily try to stop kubelet.service first, before anything else; this
×
601
        // way we ensure the rest of system stays running, because kubelet may need
×
602
        // to do "graceful" shutdown by e.g. de-registering with a load balancer.
×
603
        // However note we use `;` instead of `&&` so we keep rebooting even
×
604
        // if kubelet failed to shutdown - that way the machine will still eventually reboot
×
605
        // as systemd will time out the stop invocation.
×
606
        stdOut, StdErr, err := dn.hostHelpers.RunCommand("systemd-run", "--unit", "sriov-network-config-daemon-reboot",
×
607
                "--description", "sriov-network-config-daemon reboot node", "/bin/sh", "-c", "systemctl stop kubelet.service; reboot")
×
608

×
609
        if err != nil {
×
610
                funcLog.Error(err, "failed to reboot node", "stdOut", stdOut, "StdErr", StdErr)
×
611
                return err
×
612
        }
×
613
        return nil
×
614
}
615

616
// isDrainCompleted returns true if the current-state annotation is drain completed
617
func (dn *NodeReconciler) isDrainCompleted(reqDrain bool, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) bool {
1✔
618
        if vars.DisableDrain {
2✔
619
                return true
1✔
620
        }
1✔
621

622
        // if we need to drain check the drain status
623
        if reqDrain {
2✔
624
                return utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainComplete)
1✔
625
        }
1✔
626

627
        // check in case a reboot was requested and the second run doesn't require a drain
628
        if !utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotation, consts.DrainIdle) {
1✔
629
                return utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainComplete)
×
630
        }
×
631

632
        // if we don't need to drain at all just return true so we can apply the configuration
633
        return true
1✔
634
}
635

636
// annotate annotates the nodeState object with specified annotation.
637
func (dn *NodeReconciler) annotate(
638
        ctx context.Context,
639
        desiredNodeState *sriovnetworkv1.SriovNetworkNodeState,
640
        annotationState string) error {
1✔
641
        funcLog := log.Log.WithName("annotate")
1✔
642

1✔
643
        funcLog.Info(fmt.Sprintf("apply '%s' annotation for node", annotationState))
1✔
644
        if err := utils.AnnotateNode(ctx,
1✔
645
                desiredNodeState.Name,
1✔
646
                consts.NodeDrainAnnotation,
1✔
647
                annotationState, dn.client); err != nil {
2✔
648
                funcLog.Error(err, "Failed to annotate node")
1✔
649
                return err
1✔
650
        }
1✔
651

652
        funcLog.Info(fmt.Sprintf("apply '%s' annotation for nodeState", annotationState))
1✔
653
        if err := utils.AnnotateObject(ctx, desiredNodeState,
1✔
654
                consts.NodeStateDrainAnnotation,
1✔
655
                annotationState, dn.client); err != nil {
1✔
656
                funcLog.Error(err, "Failed to annotate nodeState")
×
657
                return err
×
658
        }
×
659

660
        // the node was annotated we need to wait for the operator to finish the drain
661
        return nil
1✔
662
}
663

664
// SetupWithManager sets up the controller with the Manager.
665
func (dn *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
1✔
666
        return ctrl.NewControllerManagedBy(mgr).
1✔
667
                For(&sriovnetworkv1.SriovNetworkNodeState{}).
1✔
668
                WithEventFilter(predicate.Or(predicate.AnnotationChangedPredicate{}, predicate.GenerationChangedPredicate{})).
1✔
669
                WithOptions(controller.Options{MaxConcurrentReconciles: 1}).
1✔
670
                Complete(dn)
1✔
671
}
1✔
672

673
// -------------------------------------
674
// ---- unit tests helper function -----
675
// -------------------------------------
676

677
// GetLastAppliedGeneration returns the reconciler's lastAppliedGeneration
// field; it exists only as a helper accessor for unit tests.
func (dn *NodeReconciler) GetLastAppliedGeneration() int64 {
	return dn.lastAppliedGeneration
}
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc