k8snetworkplumbingwg / sriov-network-operator, build 18003884172
25 Sep 2025 09:59AM UTC. Coverage: 61.951% (-0.09%) from 62.036%
Pull Request #909: Add hugepages func test (merge a62c78a96 into c49a32c97)
8683 of 14016 relevant lines covered (61.95%), 0.69 hits per line

Source file: /pkg/daemon/daemon.go (48.39% covered)
package daemon

import (
	"context"
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/predicate"

	sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
	hosttypes "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/host/types"
	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms"
	plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins"
	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils"
	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
)

// NodeReconciler holds the components necessary for reconciling an SR-IOV node.
// It includes a Kubernetes client, host and platform helpers, and other utility interfaces.
// It is designed to manage the lifecycle of SR-IOV devices on a given node.
type NodeReconciler struct {
	client client.Client

	HostHelpers helper.HostHelpersInterface

	platformHelpers platforms.Interface

	eventRecorder *EventRecorder

	featureGate featuregate.FeatureGate

	// list of disabled plugins
	disabledPlugins []string

	loadedPlugins         map[string]plugin.VendorPlugin
	lastAppliedGeneration int64
}

// New creates a new instance of NodeReconciler.
func New(
	client client.Client,
	hostHelpers helper.HostHelpersInterface,
	platformHelper platforms.Interface,
	er *EventRecorder,
	featureGates featuregate.FeatureGate,
	disabledPlugins []string,
) *NodeReconciler {
	return &NodeReconciler{
		client:          client,
		HostHelpers:     hostHelpers,
		platformHelpers: platformHelper,

		lastAppliedGeneration: 0,
		eventRecorder:         er,
		featureGate:           featureGates,
		disabledPlugins:       disabledPlugins,
	}
}
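
// Illustrative wiring (a sketch, not code from this file): a caller such as
// the config-daemon entrypoint would construct the reconciler, run Init once,
// and then register it with a controller-runtime manager. The mgr,
// hostHelpers, platformHelpers, er, and fg values below are assumed to be
// provided by that caller.
//
//	nr := New(mgr.GetClient(), hostHelpers, platformHelpers, er, fg, nil)
//	if err := nr.Init(); err != nil {
//		return err
//	}
//	if err := nr.SetupWithManager(mgr); err != nil {
//		return err
//	}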

// Init initializes the SR-IOV Network Operator daemon.
// It enables kernel modules, prepares udev rules, and loads the host network state.
func (dn *NodeReconciler) Init() error {
	funcLog := log.Log.WithName("Init")
	var err error

	if !vars.UsingSystemdMode {
		funcLog.V(0).Info("daemon running in daemon mode")
		_, err = dn.HostHelpers.CheckRDMAEnabled()
		if err != nil {
			funcLog.Error(err, "warning, failed to check RDMA state")
		}
		dn.HostHelpers.TryEnableTun()
		dn.HostHelpers.TryEnableVhostNet()
		err = dn.HostHelpers.CleanSriovFilesFromHost(vars.ClusterType == consts.ClusterTypeOpenshift)
		if err != nil {
			funcLog.Error(err, "failed to remove all the systemd sriov files")
		}
	} else {
		funcLog.V(0).Info("Run(): daemon running in systemd mode")
	}

	if err := dn.prepareNMUdevRule(); err != nil {
		funcLog.Error(err, "failed to prepare udev files to disable network manager on requested VFs")
	}
	if err := dn.HostHelpers.PrepareVFRepUdevRule(); err != nil {
		funcLog.Error(err, "failed to prepare udev files to rename VF representors for requested VFs")
	}

	// init openstack info
	if vars.PlatformType == consts.VirtualOpenStack {
		ns, err := dn.HostHelpers.GetCheckPointNodeState()
		if err != nil {
			return err
		}

		if ns == nil {
			err = dn.platformHelpers.CreateOpenstackDevicesInfo()
			if err != nil {
				return err
			}
		} else {
			dn.platformHelpers.CreateOpenstackDevicesInfoFromNodeStatus(ns)
		}
	}

	// get interfaces
	ns := &sriovnetworkv1.SriovNetworkNodeState{}
	err = dn.updateStatusFromHost(ns)
	if err != nil {
		funcLog.Error(err, "failed to get host network status on init")
		return err
	}

	// init vendor plugins
	dn.loadedPlugins, err = loadPlugins(ns, dn.HostHelpers, dn.disabledPlugins)
	if err != nil {
		funcLog.Error(err, "failed to load vendor plugins")
		return err
	}

	// save init state
	err = dn.HostHelpers.WriteCheckpointFile(ns)
	if err != nil {
		funcLog.Error(err, "failed to write checkpoint file on host")
	}
	return err
}
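
// Note on the checkpoint file: Init writes the freshly-read host state to a
// checkpoint at the end, and on the OpenStack platform reads it back above
// through GetCheckPointNodeState, so device info can be reused across daemon
// restarts instead of being rebuilt from the platform (this summarizes the
// calls above).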

// Reconcile reconciles the nodeState object by performing the following steps:
// 1. Retrieves the latest NodeState from the API server.
// 2. Checks if the object has the required drain controller annotations for the current generation.
// 3. Updates the nodeState Status object with the existing network state (interfaces, bridges, and RDMA status).
// 4. If running in systemd mode, checks the sriov result from the config-daemon that runs in systemd.
// 5. Compares the latest generation with the last applied generation to determine if a refresh on NICs is needed.
// 6. Checks for drift between the host state and the nodeState status.
// 7. Updates the sync state of the nodeState object as per the current requirements.
// 8. Determines if a drain is required based on the current state of the nodeState.
// 9. Handles the drain if necessary, ensuring that it does not conflict with other drain requests.
// 10. Applies the changes to the nodeState if there are no issues and updates the sync status accordingly.
// 11. If a reboot is required after applying the changes, returns a result to trigger a reboot.
//
// Returns a Result indicating whether the controller should requeue the request for further processing.
func (dn *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	reqLogger := log.FromContext(ctx).WithName("Reconcile")
	// Get the latest NodeState
	desiredNodeState := &sriovnetworkv1.SriovNetworkNodeState{}
	err := dn.client.Get(ctx, client.ObjectKey{Namespace: req.Namespace, Name: req.Name}, desiredNodeState)
	if err != nil {
		if errors.IsNotFound(err) {
			reqLogger.Info("NodeState doesn't exist")
			return ctrl.Result{}, nil
		}
		reqLogger.Error(err, "Failed to fetch node state", "name", vars.NodeName)
		return ctrl.Result{}, err
	}

	// Check that the object has the drain controller annotations;
	// if not, just wait for the drain controller to add them before we start taking care of the nodeState
	if !utils.ObjectHasAnnotationKey(desiredNodeState, consts.NodeStateDrainAnnotationCurrent) ||
		!utils.ObjectHasAnnotationKey(desiredNodeState, consts.NodeStateDrainAnnotation) {
		reqLogger.V(2).Info("NodeState doesn't have the current drain annotation")
		return ctrl.Result{}, nil
	}

	latest := desiredNodeState.GetGeneration()
	current := desiredNodeState.DeepCopy()
	reqLogger.V(0).Info("new generation", "generation", latest)

	// Update the nodeState Status object with the existing network state (interfaces, bridges, and RDMA status)
	err = dn.updateStatusFromHost(desiredNodeState)
	if err != nil {
		reqLogger.Error(err, "failed to get host network status")
		return ctrl.Result{}, err
	}

	// if we are running in systemd mode we want to get the sriov result from the config-daemon that runs in systemd
	sriovResult, sriovResultExists, err := dn.checkSystemdStatus()
	// TODO: in this case we need to decide whether we should try to apply again or not
	if err != nil {
		reqLogger.Error(err, "failed to check systemd status unexpected error")
		err = dn.updateSyncState(ctx, desiredNodeState, consts.SyncStatusFailed, "unexpected error")
		if err != nil {
			reqLogger.Error(err, "failed to update nodeState status")
			return ctrl.Result{}, err
		}

		return ctrl.Result{}, nil
	}

	// if we are on the latest generation make a refresh on the nics
	if dn.lastAppliedGeneration == latest {
		isDrifted, err := dn.checkHostStateDrift(ctx, desiredNodeState)
		if err != nil {
			reqLogger.Error(err, "failed to refresh host state")
			return ctrl.Result{}, err
		}

		// if there are no host state drift changes, and we are on the latest applied policy,
		// we check if we need to publish a new nodeState status; if not, we requeue
		if !isDrifted {
			shouldUpdate := dn.shouldUpdateStatus(current, desiredNodeState)
			if shouldUpdate {
				reqLogger.Info("updating nodeState with new host status")
				err = dn.updateSyncState(ctx, desiredNodeState, desiredNodeState.Status.SyncStatus, desiredNodeState.Status.LastSyncError)
				if err != nil {
					reqLogger.Error(err, "failed to update nodeState new host status")
					return ctrl.Result{}, err
				}
			}

			return ctrl.Result{RequeueAfter: consts.DaemonRequeueTime}, nil
		}
	}

	// set sync state to inProgress, but we don't clear the failed status
	err = dn.updateSyncState(ctx, desiredNodeState, consts.SyncStatusInProgress, desiredNodeState.Status.LastSyncError)
	if err != nil {
		reqLogger.Error(err, "failed to update sync status to inProgress")
		return ctrl.Result{}, err
	}

	reqReboot, reqDrain, err := dn.checkOnNodeStateChange(desiredNodeState)
	if err != nil {
		return ctrl.Result{}, err
	}

	if vars.UsingSystemdMode {
		// When running using systemd, check if the applied configuration is the latest one
		// or if there is a new config we need to apply.
		// When using systemd configuration we write the file
		systemdConfModified, err := dn.writeSystemdConfigFile(desiredNodeState)
		if err != nil {
			reqLogger.Error(err, "failed to write systemd config file")
			return ctrl.Result{}, err
		}
		reqDrain = reqDrain || systemdConfModified || !sriovResultExists
		// require a reboot if a drain is needed in systemd mode
		reqReboot = reqReboot || reqDrain
	}

	reqLogger.V(0).Info("aggregated daemon node state requirement",
		"drain-required", reqDrain, "reboot-required", reqReboot, "disable-drain", vars.DisableDrain)

	// handle drain only if the plugins request a drain, or we are already in a draining request state
	if reqDrain ||
		!utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainIdle) {
		drainInProcess, err := dn.handleDrain(ctx, desiredNodeState, reqReboot)
		if err != nil {
			reqLogger.Error(err, "failed to handle drain")
			return ctrl.Result{}, err
		}

		// TODO: remove this after we stop using the node annotation
		// The drain is still in progress, so we requeue the request in case there is an unexpected state in the draining;
		// this will allow the daemon to try again.
		if drainInProcess {
			reqLogger.Info("node drain still in progress, requeue")
			return ctrl.Result{RequeueAfter: consts.DaemonRequeueTime}, nil
		}
	}

	// if we finished the drain we should run apply here
	if dn.isDrainCompleted(reqDrain, desiredNodeState) {
		return dn.apply(ctx, desiredNodeState, reqReboot, sriovResult)
	}

	return ctrl.Result{}, nil
}
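
// Drain annotation flow, summarized from handleDrain, apply, and
// isDrainCompleted below (the authoritative constants live in pkg/consts):
//
//	DrainIdle -> DrainRequired or RebootRequired  (requested here via annotate)
//	          -> Draining                         (set by the operator's drain controller)
//	          -> DrainComplete                    (drain finished, configuration can proceed)
//	          -> DrainIdle                        (reset by apply once the node is configured)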

// checkOnNodeStateChange checks the state change required for the node based on the desired SriovNetworkNodeState.
// The function iterates over all loaded plugins and calls their OnNodeStateChange method with the desired state.
// It returns two boolean values indicating whether a reboot or drain operation is required.
func (dn *NodeReconciler) checkOnNodeStateChange(desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, bool, error) {
	funcLog := log.Log.WithName("checkOnNodeStateChange")
	reqReboot := false
	reqDrain := false

	// check if any of the plugins require a drain or reboot of the node
	for k, p := range dn.loadedPlugins {
		d, r, err := p.OnNodeStateChange(desiredNodeState)
		if err != nil {
			funcLog.Error(err, "OnNodeStateChange plugin error", "plugin-name", k)
			return false, false, err
		}
		funcLog.V(0).Info("OnNodeStateChange result",
			"plugin", k,
			"drain-required", d,
			"reboot-required", r)
		reqDrain = reqDrain || d
		reqReboot = reqReboot || r
	}

	return reqReboot, reqDrain, nil
}

// checkSystemdStatus checks the status of the systemd services on the host node.
// It returns the sriovResult struct and a boolean indicating whether the result file exists on the node.
func (dn *NodeReconciler) checkSystemdStatus() (*hosttypes.SriovResult, bool, error) {
	if !vars.UsingSystemdMode {
		return nil, false, nil
	}

	funcLog := log.Log.WithName("checkSystemdStatus")
	serviceEnabled, err := dn.HostHelpers.IsServiceEnabled(consts.SriovServicePath)
	if err != nil {
		funcLog.Error(err, "failed to check if sriov-config service exists on host")
		return nil, false, err
	}
	postNetworkServiceEnabled, err := dn.HostHelpers.IsServiceEnabled(consts.SriovPostNetworkServicePath)
	if err != nil {
		funcLog.Error(err, "failed to check if sriov-config-post-network service exists on host")
		return nil, false, err
	}

	// If the services don't exist we should continue, to let the k8s plugin create the service files.
	// This is only for k8s-based environments; on OpenShift the sriov-operator creates a MachineConfig that will apply
	// the systemd services and reboot the node, so the config-daemon doesn't need to do anything.
	sriovResult := &hosttypes.SriovResult{SyncStatus: consts.SyncStatusFailed,
		LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+
			"sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)}
	exist := false

	// check if the services exist
	if serviceEnabled && postNetworkServiceEnabled {
		exist = true
		sriovResult, err = dn.HostHelpers.ReadSriovResult()
		if err != nil {
			funcLog.Error(err, "failed to load sriov result file from host")
			return nil, false, err
		}
	}
	return sriovResult, exist, nil
}
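
// In daemon mode (vars.UsingSystemdMode is false) checkSystemdStatus returns
// (nil, false, nil); that nil result is safe because apply below only reads
// sriovResult when vars.UsingSystemdMode is set.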

// apply applies the desired state of the node by:
// 1. Applying vendor plugins that have been loaded.
// 2. Depending on whether a reboot is required or if the configuration is being done via systemd, applying the generic or virtual plugin(s).
// 3. Rebooting the node if necessary and sending an event.
// 4. Restarting the device plugin pod on the node.
// 5. Requesting an annotation update to set the node's drain state back to idle.
// 6. Synchronizing with the host network status and updating the sync status of the node in the nodeState object.
// 7. Updating the lastAppliedGeneration to the current generation.
func (dn *NodeReconciler) apply(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, reqReboot bool, sriovResult *hosttypes.SriovResult) (ctrl.Result, error) {
	reqLogger := log.FromContext(ctx).WithName("Apply")
	// apply the vendor plugins after we are done with the drain if needed
	for k, p := range dn.loadedPlugins {
		// Skip the generic and virtual plugins; we apply them last
		if k != GenericPluginName && k != VirtualPluginName {
			err := p.Apply()
			if err != nil {
				reqLogger.Error(err, "plugin Apply failed", "plugin-name", k)
				return ctrl.Result{}, err
			}
		}
	}

	// if we don't need to reboot, and we are not doing the configuration in systemd,
	// we apply the generic plugin
	if !reqReboot && !vars.UsingSystemdMode {
		// For bare-metal machines apply the generic plugin
		selectedPlugin, ok := dn.loadedPlugins[GenericPluginName]
		if ok {
			// Apply generic plugin last
			err := selectedPlugin.Apply()
			if err != nil {
				reqLogger.Error(err, "generic plugin failed to apply")
				return ctrl.Result{}, err
			}
		}

		// For virtual machines apply the virtual plugin
		selectedPlugin, ok = dn.loadedPlugins[VirtualPluginName]
		if ok {
			// Apply virtual plugin last
			err := selectedPlugin.Apply()
			if err != nil {
				reqLogger.Error(err, "virtual plugin failed to apply")
				return ctrl.Result{}, err
			}
		}
	}

	if reqReboot {
		reqLogger.Info("reboot node")
		dn.eventRecorder.SendEvent(ctx, "RebootNode", "Reboot node has been initiated")
		return ctrl.Result{}, dn.rebootNode()
	}

	if err := dn.restartDevicePluginPod(ctx); err != nil {
		reqLogger.Error(err, "failed to restart device plugin on the node")
		return ctrl.Result{}, err
	}

	err := dn.annotate(ctx, desiredNodeState, consts.DrainIdle)
	if err != nil {
		reqLogger.Error(err, "failed to request annotation update to idle")
		return ctrl.Result{}, err
	}

	reqLogger.Info("sync succeeded")
	syncStatus := consts.SyncStatusSucceeded
	lastSyncError := ""
	if vars.UsingSystemdMode {
		syncStatus = sriovResult.SyncStatus
		lastSyncError = sriovResult.LastSyncError
	}

	// Update the nodeState Status object with the existing network interfaces
	err = dn.updateStatusFromHost(desiredNodeState)
	if err != nil {
		reqLogger.Error(err, "failed to get host network status")
		return ctrl.Result{}, err
	}

	err = dn.updateSyncState(ctx, desiredNodeState, syncStatus, lastSyncError)
	if err != nil {
		reqLogger.Error(err, "failed to update sync status")
		return ctrl.Result{}, err
	}

	// update the lastAppliedGeneration
	dn.lastAppliedGeneration = desiredNodeState.Generation
	return ctrl.Result{RequeueAfter: consts.DaemonRequeueTime}, nil
}
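
// Both the no-drift path in Reconcile and the tail of apply requeue with
// consts.DaemonRequeueTime, so the daemon periodically re-reads the host state
// and re-runs checkHostStateDrift even when nothing changed in the API server.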

// checkHostStateDrift returns true if the node state drifted from the nodeState policy,
// i.e. if there is a change in the host network interfaces that requires a reconfiguration by the daemon.
func (dn *NodeReconciler) checkHostStateDrift(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
	funcLog := log.Log.WithName("checkHostStateDrift()")

	// Skip when the SriovNetworkNodeState object has just been created.
	if desiredNodeState.GetGeneration() == 1 && len(desiredNodeState.Spec.Interfaces) == 0 {
		err := dn.HostHelpers.ClearPCIAddressFolder()
		if err != nil {
			funcLog.Error(err, "failed to clear the PCI address configuration")
			return false, err
		}

		funcLog.V(0).Info("interface policy spec not yet set by controller for sriovNetworkNodeState",
			"name", desiredNodeState.Name)
		if desiredNodeState.Status.SyncStatus != consts.SyncStatusSucceeded ||
			desiredNodeState.Status.LastSyncError != "" {
			err = dn.updateSyncState(ctx, desiredNodeState, consts.SyncStatusSucceeded, "")
		}
		return false, err
	}

	// Verify changes in the status of the SriovNetworkNodeState CR.
	log.Log.V(0).Info("verifying interfaces status change")
	for _, p := range dn.loadedPlugins {
		log.Log.V(2).Info("verifying status change for plugin", "pluginName", p.Name())
		changed, err := p.CheckStatusChanges(desiredNodeState)
		if err != nil {
			return false, err
		}
		if changed {
			log.Log.V(0).Info("plugin requires a change", "pluginName", p.Name())
			return true, nil
		}
	}

	log.Log.V(0).Info("interfaces not changed")
	return false, nil
}

// writeSystemdConfigFile writes the systemd configuration file for the node
// and handles any necessary follow-up actions, such as removing an existing result file and writing supported NIC IDs.
//
// The function first attempts to write the systemd configuration file based on the desired node state.
// If the configuration file was modified, it removes the existing result file (if present) to ensure that outdated results are not used.
// After writing the configuration file and potentially removing the old result, it writes a file containing the supported NIC IDs.
func (dn *NodeReconciler) writeSystemdConfigFile(desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
	funcLog := log.Log.WithName("writeSystemdConfigFile()")
	funcLog.V(0).Info("writing systemd config file to host")
	systemdConfModified, err := dn.HostHelpers.WriteConfFile(desiredNodeState)
	if err != nil {
		funcLog.Error(err, "failed to write configuration file for systemd mode")
		return false, err
	}
	if systemdConfModified {
		// remove the existing result file to make sure that we will not use an outdated result, e.g. in case the
		// systemd service was not triggered for some reason
		err = dn.HostHelpers.RemoveSriovResult()
		if err != nil {
			funcLog.Error(err, "failed to remove result file for systemd mode")
			return false, err
		}
	}

	err = dn.HostHelpers.WriteSriovSupportedNics()
	if err != nil {
		funcLog.Error(err, "failed to write supported nic ids file for systemd mode")
		return false, err
	}

	funcLog.V(0).Info("systemd mode WriteConfFile results",
		"drain-required", systemdConfModified, "reboot-required", systemdConfModified)
	return systemdConfModified, nil
}

// handleDrain adds the right annotation to the node and nodeState object.
// It returns true if we need to finish the reconcile loop and wait for a new object.
func (dn *NodeReconciler) handleDrain(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, reqReboot bool) (bool, error) {
	funcLog := log.Log.WithName("handleDrain")
	// done with the drain, we can continue with the configuration
	if utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainComplete) {
		funcLog.Info("the node completed the draining")
		return false, nil
	}

	// the operator is still draining the node, so we reconcile
	if utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.Draining) {
		funcLog.Info("the node is still draining")
		return true, nil
	}

	// drain is disabled, we continue with the configuration
	if vars.DisableDrain {
		funcLog.Info("drain is disabled in sriovOperatorConfig")
		return false, nil
	}

	// annotate both the node and the node state with drain or reboot
	annotation := consts.DrainRequired
	if reqReboot {
		annotation = consts.RebootRequired
	}
	return true, dn.annotate(ctx, desiredNodeState, annotation)
}

// restartDevicePluginPod restarts the device plugin pod on the current node.
//
// The function checks if the pod exists, deletes it if found, and waits for it to be deleted successfully.
func (dn *NodeReconciler) restartDevicePluginPod(ctx context.Context) error {
	log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod")
	pods := &corev1.PodList{}
	err := dn.client.List(ctx, pods, &client.ListOptions{
		Namespace: vars.Namespace, Raw: &metav1.ListOptions{
			LabelSelector:   "app=sriov-device-plugin",
			FieldSelector:   "spec.nodeName=" + vars.NodeName,
			ResourceVersion: "0",
		}})
	if err != nil {
		if errors.IsNotFound(err) {
			log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
			return nil
		}
		log.Log.Error(err, "restartDevicePluginPod(): Failed to list device plugin pod, retrying")
		return err
	}

	if len(pods.Items) == 0 {
		log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
		return nil
	}

	for _, pod := range pods.Items {
		log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", pod.Name)
		err = dn.client.Delete(ctx, &pod)
		if errors.IsNotFound(err) {
			log.Log.Info("restartDevicePluginPod(): pod to delete not found")
			continue
		}
		if err != nil {
			log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
			return err
		}

		tmpPod := &corev1.Pod{}
		if err := wait.PollUntilContextCancel(ctx, 3*time.Second, true, func(ctx context.Context) (bool, error) {
			err := dn.client.Get(ctx, client.ObjectKeyFromObject(&pod), tmpPod)
			if errors.IsNotFound(err) {
				log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
				return true, nil
			}

			if err != nil {
				log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
			} else {
				log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", pod.Name)
			}
			return false, nil
		}); err != nil {
			log.Log.Error(err, "restartDevicePluginPod(): failed to wait for pod deletion")
			return err
		}
	}

	return nil
}
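
// The pod list above uses ResourceVersion "0", which allows the API server to
// answer from its watch cache rather than etcd, and a spec.nodeName field
// selector so that only the device plugin pod running on this node is matched.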

// rebootNode reboots the node by executing a systemd-run command.
func (dn *NodeReconciler) rebootNode() error {
	funcLog := log.Log.WithName("rebootNode")
	funcLog.Info("trigger node reboot")
	exit, err := dn.HostHelpers.Chroot(consts.Host)
	if err != nil {
		funcLog.Error(err, "chroot command failed")
		return err
	}
	defer exit()
	// Create a new transient systemd unit to reboot the system.
	// We explicitly try to stop kubelet.service first, before anything else; this
	// way we ensure the rest of the system stays running, because kubelet may need
	// to do a "graceful" shutdown, e.g. de-registering with a load balancer.
	// However, note we use `;` instead of `&&` so we keep rebooting even
	// if kubelet failed to shut down; that way the machine will still eventually reboot,
	// as systemd will time out the stop invocation.
	stdOut, stdErr, err := dn.HostHelpers.RunCommand("systemd-run", "--unit", "sriov-network-config-daemon-reboot",
		"--description", "sriov-network-config-daemon reboot node", "/bin/sh", "-c", "systemctl stop kubelet.service; reboot")

	if err != nil {
		funcLog.Error(err, "failed to reboot node", "stdOut", stdOut, "stdErr", stdErr)
		return err
	}
	return nil
}

// prepareNMUdevRule prepares/validates the status of the config-daemon custom udev rules needed so
// the virtual functions are controlled by the operator only.
func (dn *NodeReconciler) prepareNMUdevRule() error {
	// We need to remove the Red Hat virtio network device from the udev rule configuration:
	// if we don't remove it, running the config-daemon on a virtual node will disconnect the node after a reboot.
	// Even though the operator should not be installed on virtual environments other than OpenStack,
	// we should not destroy the cluster if the operator is installed there.
	supportedVfIds := []string{}
	for _, vfID := range sriovnetworkv1.GetSupportedVfIds() {
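		// Assumption for context: 0x1000 and 0x1041 are the PCI device IDs of
		// the virtio-net transitional and modern (virtio 1.0) devices (vendor
		// 0x1af4), which is why they are filtered out here.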
		if vfID == "0x1000" || vfID == "0x1041" {
			continue
		}
		supportedVfIds = append(supportedVfIds, vfID)
	}

	return dn.HostHelpers.PrepareNMUdevRule(supportedVfIds)
}

// isDrainCompleted returns true if the current-state annotation is drain completed
func (dn *NodeReconciler) isDrainCompleted(reqDrain bool, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) bool {
	if vars.DisableDrain {
		return true
	}

	// if we need to drain, check the drain status
	if reqDrain {
		return utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainComplete)
	}

	// check in case a reboot was requested and the second run doesn't require a drain
	if !utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotation, consts.DrainIdle) {
		return utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainComplete)
	}

	// if we don't need to drain at all, just return true so we can apply the configuration
	return true
}

// annotate annotates the nodeState object with the specified annotation.
func (dn *NodeReconciler) annotate(
	ctx context.Context,
	desiredNodeState *sriovnetworkv1.SriovNetworkNodeState,
	annotationState string) error {
	funcLog := log.Log.WithName("annotate")

	funcLog.Info(fmt.Sprintf("apply '%s' annotation for node", annotationState))
	if err := utils.AnnotateNode(ctx,
		desiredNodeState.Name,
		consts.NodeDrainAnnotation,
		annotationState, dn.client); err != nil {
		funcLog.Error(err, "Failed to annotate node")
		return err
	}

	funcLog.Info(fmt.Sprintf("apply '%s' annotation for nodeState", annotationState))
	if err := utils.AnnotateObject(ctx, desiredNodeState,
		consts.NodeStateDrainAnnotation,
		annotationState, dn.client); err != nil {
		funcLog.Error(err, "Failed to annotate nodeState")
		return err
	}

	// the node was annotated; we need to wait for the operator to finish the drain
	return nil
}

// SetupWithManager sets up the controller with the Manager.
func (dn *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
	return ctrl.NewControllerManagedBy(mgr).
		For(&sriovnetworkv1.SriovNetworkNodeState{}).
		WithEventFilter(predicate.Or(predicate.AnnotationChangedPredicate{}, predicate.GenerationChangedPredicate{})).
		WithOptions(controller.Options{MaxConcurrentReconciles: 1}).
		Complete(dn)
}
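
// For update events, the predicate filter above passes only changes to
// metadata.generation (spec updates) or to annotations (e.g. the drain
// controller flipping the drain state); status-only updates do not trigger
// a reconcile. Create and delete events still pass through the filter.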

// -------------------------------------
// ---- unit test helper function ------
// -------------------------------------

func (dn *NodeReconciler) GetLastAppliedGeneration() int64 {
	return dn.lastAppliedGeneration
}