• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

k8snetworkplumbingwg / sriov-network-operator / 14726142035

29 Apr 2025 07:51AM UTC coverage: 61.584% (-0.03%) from 61.612%
14726142035

push

github

web-flow
Merge pull request #888 from OguzPastirmaci/dev/update-helm-readme

Update Helm readme with 1.5.0 version

8517 of 13830 relevant lines covered (61.58%)

0.68 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

44.69
/pkg/daemon/daemon.go
1
package daemon
2

3
import (
4
        "context"
5
        "fmt"
6
        "time"
7

8
        corev1 "k8s.io/api/core/v1"
9
        "k8s.io/apimachinery/pkg/api/errors"
10
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
11
        "k8s.io/apimachinery/pkg/util/wait"
12
        ctrl "sigs.k8s.io/controller-runtime"
13
        "sigs.k8s.io/controller-runtime/pkg/client"
14
        "sigs.k8s.io/controller-runtime/pkg/controller"
15
        "sigs.k8s.io/controller-runtime/pkg/log"
16
        "sigs.k8s.io/controller-runtime/pkg/predicate"
17

18
        sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
19
        "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
20
        "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
21
        "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
22
        hosttypes "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/host/types"
23
        "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms"
24
        plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins"
25
        "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils"
26
        "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
27
)
28

// NodeReconciler struct holds various components necessary for reconciling an SR-IOV node.
// It includes a Kubernetes client, SR-IOV client, and other utility interfaces.
// The struct is designed to manage the lifecycle of an SR-IOV devices on a given node.
type NodeReconciler struct {
	// client reads SriovNetworkNodeState objects and lists/deletes
	// device-plugin pods on this node.
	client client.Client

	// HostHelpers wraps host-level operations (udev rules, systemd files,
	// checkpoint file, chroot/reboot commands). Exported so alternative
	// implementations can be injected.
	HostHelpers helper.HostHelpersInterface

	// platformHelpers provides platform-specific device discovery
	// (used here for OpenStack virtual deployments).
	platformHelpers platforms.Interface

	// eventRecorder emits Kubernetes events (e.g. before a node reboot).
	eventRecorder *EventRecorder

	// featureGate exposes the operator's feature-gate settings.
	featureGate featuregate.FeatureGate

	// list of disabled plugins
	disabledPlugins []string

	// loadedPlugins holds the vendor plugins selected for this node, keyed by plugin name.
	loadedPlugins map[string]plugin.VendorPlugin
	// lastAppliedGeneration is the nodeState generation that was last applied
	// successfully; used to skip re-applying an unchanged spec.
	lastAppliedGeneration int64
}
49

50
// New creates a new instance of NodeReconciler.
51
func New(
52
        client client.Client,
53
        hostHelpers helper.HostHelpersInterface,
54
        platformHelper platforms.Interface,
55
        er *EventRecorder,
56
        featureGates featuregate.FeatureGate,
57
        disabledPlugins []string,
58
) *NodeReconciler {
1✔
59
        return &NodeReconciler{
1✔
60
                client:          client,
1✔
61
                HostHelpers:     hostHelpers,
1✔
62
                platformHelpers: platformHelper,
1✔
63

1✔
64
                lastAppliedGeneration: 0,
1✔
65
                eventRecorder:         er,
1✔
66
                featureGate:           featureGates,
1✔
67
                disabledPlugins:       disabledPlugins,
1✔
68
        }
1✔
69
}
1✔
70

// Init initializes the Sriov Network Operator daemon.
// It enables kernel modules, prepare udev rules and load the host network state
func (dn *NodeReconciler) Init() error {
	funcLog := log.Log.WithName("Init")
	var err error

	if !vars.UsingSystemdMode {
		funcLog.V(0).Info("daemon running in daemon mode")
		// The checks/enables below are best-effort: failures are logged but
		// must not block daemon startup.
		_, err = dn.HostHelpers.CheckRDMAEnabled()
		if err != nil {
			funcLog.Error(err, "warning, failed to check RDMA state")
		}
		dn.HostHelpers.TryEnableTun()
		dn.HostHelpers.TryEnableVhostNet()
		// In daemon mode any leftover systemd-mode files are stale; remove them.
		err = dn.HostHelpers.CleanSriovFilesFromHost(vars.ClusterType == consts.ClusterTypeOpenshift)
		if err != nil {
			funcLog.Error(err, "failed to remove all the systemd sriov files")
		}
	} else {
		funcLog.V(0).Info("Run(): daemon running in systemd mode")
	}

	// Udev rule preparation is also best-effort: log and continue on error.
	if err := dn.prepareNMUdevRule(); err != nil {
		funcLog.Error(err, "failed to prepare udev files to disable network manager on requested VFs")
	}
	if err := dn.HostHelpers.PrepareVFRepUdevRule(); err != nil {
		funcLog.Error(err, "failed to prepare udev files to rename VF representors for requested VFs")
	}

	// init openstack info
	if vars.PlatformType == consts.VirtualOpenStack {
		ns, err := dn.HostHelpers.GetCheckPointNodeState()
		if err != nil {
			return err
		}

		if ns == nil {
			// No checkpoint yet: discover the devices from the platform itself.
			err = dn.platformHelpers.CreateOpenstackDevicesInfo()
			if err != nil {
				return err
			}
		} else {
			// Reuse device info recorded in the checkpointed node state.
			dn.platformHelpers.CreateOpenstackDevicesInfoFromNodeStatus(ns)
		}
	}

	// get interfaces
	ns := &sriovnetworkv1.SriovNetworkNodeState{}
	err = dn.updateStatusFromHost(ns)
	if err != nil {
		funcLog.Error(err, "failed to get host network status on init")
		return err
	}

	// init vendor plugins
	dn.loadedPlugins, err = loadPlugins(ns, dn.HostHelpers, dn.disabledPlugins)
	if err != nil {
		funcLog.Error(err, "failed to load vendor plugins")
		return err
	}

	// save init state
	err = dn.HostHelpers.WriteCheckpointFile(ns)
	if err != nil {
		funcLog.Error(err, "failed to write checkpoint file on host")
	}
	return err
}
139

// Reconcile Reconciles the nodeState object by performing the following steps:
// 1. Retrieves the latest NodeState from the API server.
// 2. Checks if the object has the required drain controller annotations for the current generation.
// 3. Updates the nodeState Status object with the existing network state (interfaces, bridges, and RDMA status).
// 4. If running in systemd mode, checks the sriov result from the config-daemon that runs in systemd.
// 5. Compares the latest generation with the last applied generation to determine if a refresh on NICs is needed.
// 6. Checks for drift between the host state and the nodeState status.
// 7. Updates the sync state of the nodeState object as per the current requirements.
// 8. Determines if a drain is required based on the current state of the nodeState.
// 9. Handles the drain if necessary, ensuring that it does not conflict with other drain requests.
// 10. Applies the changes to the nodeState if there are no issues and updates the sync status accordingly.
// 11. If a reboot is required after applying the changes, returns a result to trigger a reboot.
//
// Returns a Result indicating whether or not the controller should requeue the request for further processing.
func (dn *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	reqLogger := log.FromContext(ctx).WithName("Reconcile")
	// Get the latest NodeState
	desiredNodeState := &sriovnetworkv1.SriovNetworkNodeState{}
	err := dn.client.Get(ctx, client.ObjectKey{Namespace: req.Namespace, Name: req.Name}, desiredNodeState)
	if err != nil {
		if errors.IsNotFound(err) {
			// Object deleted: nothing to reconcile.
			reqLogger.Info("NodeState doesn't exist")
			return ctrl.Result{}, nil
		}
		reqLogger.Error(err, "Failed to fetch node state", "name", vars.NodeName)
		return ctrl.Result{}, err
	}

	// Check the object as the drain controller annotations
	// if not just wait for the drain controller to add them before we start taking care of the nodeState
	if !utils.ObjectHasAnnotationKey(desiredNodeState, consts.NodeStateDrainAnnotationCurrent) ||
		!utils.ObjectHasAnnotationKey(desiredNodeState, consts.NodeStateDrainAnnotation) {
		reqLogger.V(2).Info("NodeState doesn't have the current drain annotation")
		return ctrl.Result{}, nil
	}

	latest := desiredNodeState.GetGeneration()
	// Keep a deep copy of the object before we refresh its status from the host,
	// so shouldUpdateStatus can compare old vs. new status later.
	current := desiredNodeState.DeepCopy()
	reqLogger.V(0).Info("new generation", "generation", latest)

	// Update the nodeState Status object with the existing network state (interfaces bridges and rdma status)
	err = dn.updateStatusFromHost(desiredNodeState)
	if err != nil {
		reqLogger.Error(err, "failed to get host network status")
		return ctrl.Result{}, err
	}

	// if we are running in systemd mode we want to get the sriov result from the config-daemon that runs in systemd
	sriovResult, sriovResultExists, err := dn.checkSystemdStatus()
	//TODO: in the case we need to think what to do if we try to apply again or not
	if err != nil {
		reqLogger.Error(err, "failed to check systemd status unexpected error")
		err = dn.updateSyncState(ctx, desiredNodeState, consts.SyncStatusFailed, "unexpected error")
		if err != nil {
			reqLogger.Error(err, "failed to update nodeState status")
			return ctrl.Result{}, err
		}

		// Status recorded as failed; do not requeue with an error.
		return ctrl.Result{}, nil
	}

	// if we are on the latest generation make a refresh on the nics
	if dn.lastAppliedGeneration == latest {
		isDrifted, err := dn.checkHostStateDrift(ctx, desiredNodeState)
		if err != nil {
			reqLogger.Error(err, "failed to refresh host state")
			return ctrl.Result{}, err
		}

		// if there are no host state drift changes, and we are on the latest applied policy
		// we check if we need to publish a new nodeState status if not we requeue
		if !isDrifted {
			shouldUpdate := dn.shouldUpdateStatus(current, desiredNodeState)
			if shouldUpdate {
				reqLogger.Info("updating nodeState with new host status")
				err = dn.updateSyncState(ctx, desiredNodeState, desiredNodeState.Status.SyncStatus, desiredNodeState.Status.LastSyncError)
				if err != nil {
					reqLogger.Error(err, "failed to update nodeState new host status")
					return ctrl.Result{}, err
				}
			}

			return ctrl.Result{RequeueAfter: consts.DaemonRequeueTime}, nil
		}
	}

	// set sync state to inProgress, but we don't clear the failed status
	err = dn.updateSyncState(ctx, desiredNodeState, consts.SyncStatusInProgress, desiredNodeState.Status.LastSyncError)
	if err != nil {
		reqLogger.Error(err, "failed to update sync status to inProgress")
		return ctrl.Result{}, err
	}

	// Ask the vendor plugins whether this change needs a reboot and/or a drain.
	reqReboot, reqDrain, err := dn.checkOnNodeStateChange(desiredNodeState)
	if err != nil {
		return ctrl.Result{}, err
	}

	if vars.UsingSystemdMode {
		// When running using systemd check if the applied configuration is the latest one
		// or there is a new config we need to apply
		// When using systemd configuration we write the file
		systemdConfModified, err := dn.writeSystemdConfigFile(desiredNodeState)
		if err != nil {
			reqLogger.Error(err, "failed to write systemd config file")
			return ctrl.Result{}, err
		}
		// A missing sriov result file also forces a drain (service never ran).
		reqDrain = reqDrain || systemdConfModified || !sriovResultExists
		// require reboot if drain needed for systemd mode
		reqReboot = reqReboot || reqDrain
	}

	reqLogger.V(0).Info("aggregated daemon node state requirement",
		"drain-required", reqDrain, "reboot-required", reqReboot, "disable-drain", vars.DisableDrain)

	// handle drain only if the plugins request drain, or we are already in a draining request state
	if reqDrain ||
		!utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainIdle) {
		drainInProcess, err := dn.handleDrain(ctx, desiredNodeState, reqReboot)
		if err != nil {
			reqLogger.Error(err, "failed to handle drain")
			return ctrl.Result{}, err
		}
		// drain is still in progress we don't need to re-queue the request as the operator will update the annotation
		if drainInProcess {
			return ctrl.Result{}, nil
		}
	}

	// if we finish the drain we should run apply here
	if dn.isDrainCompleted(reqDrain, desiredNodeState) {
		return dn.apply(ctx, desiredNodeState, reqReboot, sriovResult)
	}

	return ctrl.Result{}, nil
}
276

277
// checkOnNodeStateChange checks the state change required for the node based on the desired SriovNetworkNodeState.
278
// The function iterates over all loaded plugins and calls their OnNodeStateChange method with the desired state.
279
// It returns two boolean values indicating whether a reboot or drain operation is required.
280
func (dn *NodeReconciler) checkOnNodeStateChange(desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, bool, error) {
1✔
281
        funcLog := log.Log.WithName("checkOnNodeStateChange")
1✔
282
        reqReboot := false
1✔
283
        reqDrain := false
1✔
284

1✔
285
        // check if any of the plugins required to drain or reboot the node
1✔
286
        for k, p := range dn.loadedPlugins {
2✔
287
                d, r, err := p.OnNodeStateChange(desiredNodeState)
1✔
288
                if err != nil {
1✔
289
                        funcLog.Error(err, "OnNodeStateChange plugin error", "plugin-name", k)
×
290
                        return false, false, err
×
291
                }
×
292
                funcLog.V(0).Info("OnNodeStateChange result",
1✔
293
                        "plugin", k,
1✔
294
                        "drain-required", d,
1✔
295
                        "reboot-required", r)
1✔
296
                reqDrain = reqDrain || d
1✔
297
                reqReboot = reqReboot || r
1✔
298
        }
299

300
        return reqReboot, reqDrain, nil
1✔
301
}
302

303
// checkSystemdStatus Checks the status of systemd services on the host node.
304
// return the sriovResult struct a boolean if the result file exist on the node
305
func (dn *NodeReconciler) checkSystemdStatus() (*hosttypes.SriovResult, bool, error) {
1✔
306
        if !vars.UsingSystemdMode {
2✔
307
                return nil, false, nil
1✔
308
        }
1✔
309

310
        funcLog := log.Log.WithName("checkSystemdStatus")
×
311
        serviceEnabled, err := dn.HostHelpers.IsServiceEnabled(consts.SriovServicePath)
×
312
        if err != nil {
×
313
                funcLog.Error(err, "failed to check if sriov-config service exist on host")
×
314
                return nil, false, err
×
315
        }
×
316
        postNetworkServiceEnabled, err := dn.HostHelpers.IsServiceEnabled(consts.SriovPostNetworkServicePath)
×
317
        if err != nil {
×
318
                funcLog.Error(err, "failed to check if sriov-config-post-network service exist on host")
×
319
                return nil, false, err
×
320
        }
×
321

322
        // if the service doesn't exist we should continue to let the k8s plugin to create the service files
323
        // this is only for k8s base environments, for openshift the sriov-operator creates a machine config to will apply
324
        // the system service and reboot the node the config-daemon doesn't need to do anything.
325
        sriovResult := &hosttypes.SriovResult{SyncStatus: consts.SyncStatusFailed,
×
326
                LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+
×
327
                        "sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)}
×
328
        exist := false
×
329

×
330
        // check if the service exist
×
331
        if serviceEnabled && postNetworkServiceEnabled {
×
332
                exist = true
×
333
                sriovResult, err = dn.HostHelpers.ReadSriovResult()
×
334
                if err != nil {
×
335
                        funcLog.Error(err, "failed to load sriov result file from host")
×
336
                        return nil, false, err
×
337
                }
×
338
        }
339
        return sriovResult, exist, nil
×
340
}
341

// apply applies the desired state of the node by:
// 1. Applying vendor plugins that have been loaded.
// 2. Depending on whether a reboot is required or if the configuration is being done via systemd, it applies the generic or virtual plugin(s).
// 3. Rebooting the node if necessary and sending an event.
// 4. Restarting the device plugin pod on the node.
// 5. Requesting annotation updates for draining the idle state of the node.
// 6. Synchronizing with the host network status and updating the sync status of the node in the nodeState object.
// 7. Updating the lastAppliedGeneration to the current generation.
func (dn *NodeReconciler) apply(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, reqReboot bool, sriovResult *hosttypes.SriovResult) (ctrl.Result, error) {
	reqLogger := log.FromContext(ctx).WithName("Apply")
	// apply the vendor plugins after we are done with drain if needed
	for k, p := range dn.loadedPlugins {
		// Skip both the general and virtual plugin apply them last
		if k != GenericPluginName && k != VirtualPluginName {
			err := p.Apply()
			if err != nil {
				reqLogger.Error(err, "plugin Apply failed", "plugin-name", k)
				return ctrl.Result{}, err
			}
		}
	}

	// if we don't need to reboot, or we are not doing the configuration in systemd
	// we apply the generic plugin
	if !reqReboot && !vars.UsingSystemdMode {
		// For BareMetal machines apply the generic plugin
		selectedPlugin, ok := dn.loadedPlugins[GenericPluginName]
		if ok {
			// Apply generic plugin last
			err := selectedPlugin.Apply()
			if err != nil {
				reqLogger.Error(err, "generic plugin fail to apply")
				return ctrl.Result{}, err
			}
		}

		// For Virtual machines apply the virtual plugin
		selectedPlugin, ok = dn.loadedPlugins[VirtualPluginName]
		if ok {
			// Apply virtual plugin last
			err := selectedPlugin.Apply()
			if err != nil {
				reqLogger.Error(err, "virtual plugin failed to apply")
				return ctrl.Result{}, err
			}
		}
	}

	if reqReboot {
		// Reconcile ends here; the daemon will come back after the node reboots.
		reqLogger.Info("reboot node")
		dn.eventRecorder.SendEvent(ctx, "RebootNode", "Reboot node has been initiated")
		return ctrl.Result{}, dn.rebootNode()
	}

	// Restart the device plugin so it re-discovers the (re)configured VFs.
	if err := dn.restartDevicePluginPod(ctx); err != nil {
		reqLogger.Error(err, "failed to restart device plugin on the node")
		return ctrl.Result{}, err
	}

	// Configuration is applied; move the drain annotation back to idle.
	err := dn.annotate(ctx, desiredNodeState, consts.DrainIdle)
	if err != nil {
		reqLogger.Error(err, "failed to request annotation update to idle")
		return ctrl.Result{}, err
	}

	reqLogger.Info("sync succeeded")
	syncStatus := consts.SyncStatusSucceeded
	lastSyncError := ""
	if vars.UsingSystemdMode {
		// In systemd mode the authoritative result comes from the systemd service.
		syncStatus = sriovResult.SyncStatus
		lastSyncError = sriovResult.LastSyncError
	}

	// Update the nodeState Status object with the existing network interfaces
	err = dn.updateStatusFromHost(desiredNodeState)
	if err != nil {
		reqLogger.Error(err, "failed to get host network status")
		return ctrl.Result{}, err
	}

	err = dn.updateSyncState(ctx, desiredNodeState, syncStatus, lastSyncError)
	if err != nil {
		reqLogger.Error(err, "failed to update sync status")
		return ctrl.Result{}, err
	}

	// update the lastAppliedGeneration
	dn.lastAppliedGeneration = desiredNodeState.Generation
	return ctrl.Result{RequeueAfter: consts.DaemonRequeueTime}, nil
}
432

// checkHostStateDrift returns true if the node state drifted from the nodeState policy
// Check if there is a change in the host network interfaces that require a reconfiguration by the daemon
func (dn *NodeReconciler) checkHostStateDrift(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
	funcLog := log.Log.WithName("checkHostStateDrift()")

	// Skip when SriovNetworkNodeState object has just been created.
	if desiredNodeState.GetGeneration() == 1 && len(desiredNodeState.Spec.Interfaces) == 0 {
		err := dn.HostHelpers.ClearPCIAddressFolder()
		if err != nil {
			funcLog.Error(err, "failed to clear the PCI address configuration")
			return false, err
		}

		funcLog.V(0).Info("interface policy spec not yet set by controller for sriovNetworkNodeState",
			"name", desiredNodeState.Name)
		// Only publish a status update when the status isn't already a clean
		// "Succeeded"; err (nil here) is reused to carry the update result.
		if desiredNodeState.Status.SyncStatus != consts.SyncStatusSucceeded ||
			desiredNodeState.Status.LastSyncError != "" {
			err = dn.updateSyncState(ctx, desiredNodeState, consts.SyncStatusSucceeded, "")
		}
		return false, err
	}

	// Verify changes in the status of the SriovNetworkNodeState CR.
	log.Log.V(0).Info("verifying interfaces status change")
	for _, p := range dn.loadedPlugins {
		// Verify changes in the status of the SriovNetworkNodeState CR.
		log.Log.V(2).Info("verifying status change for plugin", "pluginName", p.Name())
		changed, err := p.CheckStatusChanges(desiredNodeState)
		if err != nil {
			return false, err
		}
		if changed {
			// The first plugin reporting a change is enough to flag drift.
			log.Log.V(0).Info("plugin require change", "pluginName", p.Name())
			return true, nil
		}
	}

	log.Log.V(0).Info("Interfaces not changed")
	return false, nil
}
473

474
// writeSystemdConfigFile Writes the systemd configuration file for the node
475
// and handles any necessary actions such as removing an existing result file and writing supported NIC IDs.
476
//
477
//        The function first attempts to write the systemd configuration file based on the desired node state.
478
//        If successful, it checks if the configuration file was modified. If so, it removes the existing result file (if present) to ensure that outdated results are not used.
479
//        After writing the configuration file and potentially removing the old one, it writes a file containing supported NIC IDs.
480
func (dn *NodeReconciler) writeSystemdConfigFile(desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
×
481
        funcLog := log.Log.WithName("writeSystemdConfigFile()")
×
482
        funcLog.V(0).Info("writing systemd config file to host")
×
483
        systemdConfModified, err := dn.HostHelpers.WriteConfFile(desiredNodeState)
×
484
        if err != nil {
×
485
                funcLog.Error(err, "failed to write configuration file for systemd mode")
×
486
                return false, err
×
487
        }
×
488
        if systemdConfModified {
×
489
                // remove existing result file to make sure that we will not use outdated result, e.g. in case if
×
490
                // systemd service was not triggered for some reason
×
491
                err = dn.HostHelpers.RemoveSriovResult()
×
492
                if err != nil {
×
493
                        funcLog.Error(err, "failed to remove result file for systemd mode")
×
494
                        return false, err
×
495
                }
×
496
        }
497

498
        err = dn.HostHelpers.WriteSriovSupportedNics()
×
499
        if err != nil {
×
500
                funcLog.Error(err, "failed to write supported nic ids file for systemd mode")
×
501
                return false, err
×
502
        }
×
503

504
        funcLog.V(0).Info("systemd mode WriteConfFile results",
×
505
                "drain-required", systemdConfModified, "reboot-required", systemdConfModified)
×
506
        return systemdConfModified, nil
×
507
}
508

509
// handleDrain: adds the right annotation to the node and nodeState object
510
// returns true if we need to finish the reconcile loop and wait for a new object
511
func (dn *NodeReconciler) handleDrain(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, reqReboot bool) (bool, error) {
1✔
512
        funcLog := log.Log.WithName("handleDrain")
1✔
513
        // done with the drain we can continue with the configuration
1✔
514
        if utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainComplete) {
2✔
515
                funcLog.Info("the node complete the draining")
1✔
516
                return false, nil
1✔
517
        }
1✔
518

519
        // the operator is still draining the node so we reconcile
520
        if utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.Draining) {
1✔
521
                funcLog.Info("the node is still draining")
×
522
                return true, nil
×
523
        }
×
524

525
        // drain is disabled we continue with the configuration
526
        if vars.DisableDrain {
1✔
527
                funcLog.Info("drain is disabled in sriovOperatorConfig")
×
528
                return false, nil
×
529
        }
×
530

531
        // annotate both node and node state with drain or reboot
532
        annotation := consts.DrainRequired
1✔
533
        if reqReboot {
1✔
534
                annotation = consts.RebootRequired
×
535
        }
×
536
        return true, dn.annotate(ctx, desiredNodeState, annotation)
1✔
537
}
538

// restartDevicePluginPod restarts the device plugin pod on the specified node.
//
// The function checks if the pod exists, deletes it if found, and waits for it to be deleted successfully.
func (dn *NodeReconciler) restartDevicePluginPod(ctx context.Context) error {
	log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod")
	pods := &corev1.PodList{}
	// List only device-plugin pods scheduled on this node. ResourceVersion "0"
	// lets the API server answer from its cache instead of etcd.
	err := dn.client.List(ctx, pods, &client.ListOptions{
		Namespace: vars.Namespace, Raw: &metav1.ListOptions{
			LabelSelector:   "app=sriov-device-plugin",
			FieldSelector:   "spec.nodeName=" + vars.NodeName,
			ResourceVersion: "0",
		}})
	if err != nil {
		if errors.IsNotFound(err) {
			log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
			return nil
		}
		log.Log.Error(err, "restartDevicePluginPod(): Failed to list device plugin pod, retrying")
		return err
	}

	// No pod running on this node: nothing to restart.
	if len(pods.Items) == 0 {
		log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
		return nil
	}

	for _, pod := range pods.Items {
		log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", pod.Name)
		err = dn.client.Delete(ctx, &pod)
		if errors.IsNotFound(err) {
			// Pod already gone; nothing left to wait for.
			log.Log.Info("restartDevicePluginPod(): pod to delete not found")
			continue
		}
		if err != nil {
			log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
			return err
		}

		// Poll every 3s until the deleted pod disappears (NotFound) or ctx is
		// cancelled; transient Get errors are logged and retried.
		tmpPod := &corev1.Pod{}
		if err := wait.PollUntilContextCancel(ctx, 3*time.Second, true, func(ctx context.Context) (bool, error) {
			err := dn.client.Get(ctx, client.ObjectKeyFromObject(&pod), tmpPod)
			if errors.IsNotFound(err) {
				log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
				return true, nil
			}

			if err != nil {
				log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
			} else {
				log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", pod.Name)
			}
			return false, nil
		}); err != nil {
			log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
			return err
		}
	}

	return nil
}
599

// rebootNode Reboots the node by executing a systemd-run command
func (dn *NodeReconciler) rebootNode() error {
	funcLog := log.Log.WithName("rebootNode")
	funcLog.Info("trigger node reboot")
	// Enter the host root so the systemd-run below acts on the node itself,
	// not inside the daemon's container.
	exit, err := dn.HostHelpers.Chroot(consts.Host)
	if err != nil {
		funcLog.Error(err, "chroot command failed")
		return err
	}
	defer exit()
	// creates a new transient systemd unit to reboot the system.
	// We explicitly try to stop kubelet.service first, before anything else; this
	// way we ensure the rest of system stays running, because kubelet may need
	// to do "graceful" shutdown by e.g. de-registering with a load balancer.
	// However note we use `;` instead of `&&` so we keep rebooting even
	// if kubelet failed to shutdown - that way the machine will still eventually reboot
	// as systemd will time out the stop invocation.
	stdOut, StdErr, err := dn.HostHelpers.RunCommand("systemd-run", "--unit", "sriov-network-config-daemon-reboot",
		"--description", "sriov-network-config-daemon reboot node", "/bin/sh", "-c", "systemctl stop kubelet.service; reboot")

	if err != nil {
		funcLog.Error(err, "failed to reboot node", "stdOut", stdOut, "StdErr", StdErr)
		return err
	}
	return nil
}
626

627
// prepareNMUdevRule prepares/validate the status of the config-daemon custom udev rules needed to control
628
// the virtual functions by the operator only.
629
func (dn *NodeReconciler) prepareNMUdevRule() error {
1✔
630
        // we need to remove the Red Hat Virtio network device from the udev rule configuration
1✔
631
        // if we don't remove it when running the config-daemon on a virtual node it will disconnect the node after a reboot
1✔
632
        // even that the operator should not be installed on virtual environments that are not openstack
1✔
633
        // we should not destroy the cluster if the operator is installed there
1✔
634
        supportedVfIds := []string{}
1✔
635
        for _, vfID := range sriovnetworkv1.GetSupportedVfIds() {
1✔
636
                if vfID == "0x1000" || vfID == "0x1041" {
×
637
                        continue
×
638
                }
639
                supportedVfIds = append(supportedVfIds, vfID)
×
640
        }
641

642
        return dn.HostHelpers.PrepareNMUdevRule(supportedVfIds)
1✔
643
}
644

645
// isDrainCompleted returns true if the current-state annotation is drain completed
646
func (dn *NodeReconciler) isDrainCompleted(reqDrain bool, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) bool {
1✔
647
        // if we need to drain check the drain status
1✔
648
        if reqDrain {
2✔
649
                return utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainComplete)
1✔
650
        }
1✔
651

652
        // check in case a reboot was requested and the second run doesn't require a drain
653
        if !utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotation, consts.DrainIdle) {
1✔
654
                return utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainComplete)
×
655
        }
×
656

657
        // if we don't need to drain at all just return true so we can apply the configuration
658
        return true
1✔
659
}
660

661
// annotate annotates the nodeState object with specified annotation.
662
func (dn *NodeReconciler) annotate(
663
        ctx context.Context,
664
        desiredNodeState *sriovnetworkv1.SriovNetworkNodeState,
665
        annotationState string) error {
1✔
666
        funcLog := log.Log.WithName("annotate")
1✔
667

1✔
668
        funcLog.Info(fmt.Sprintf("apply '%s' annotation for node", annotationState))
1✔
669
        err := utils.AnnotateNode(ctx, desiredNodeState.Name, consts.NodeDrainAnnotation, annotationState, dn.client)
1✔
670
        if err != nil {
1✔
671
                log.Log.Error(err, "Failed to annotate node")
×
672
                return err
×
673
        }
×
674

675
        funcLog.Info(fmt.Sprintf("apply '%s' annotation for nodeState", annotationState))
1✔
676
        if err := utils.AnnotateObject(context.Background(), desiredNodeState,
1✔
677
                consts.NodeStateDrainAnnotation,
1✔
678
                annotationState, dn.client); err != nil {
1✔
679
                return err
×
680
        }
×
681

682
        // the node was annotated we need to wait for the operator to finish the drain
683
        return nil
1✔
684
}
685

686
// SetupWithManager sets up the controller with the Manager.
687
func (dn *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
1✔
688
        return ctrl.NewControllerManagedBy(mgr).
1✔
689
                For(&sriovnetworkv1.SriovNetworkNodeState{}).
1✔
690
                WithEventFilter(predicate.Or(predicate.AnnotationChangedPredicate{}, predicate.GenerationChangedPredicate{})).
1✔
691
                WithOptions(controller.Options{MaxConcurrentReconciles: 1}).
1✔
692
                Complete(dn)
1✔
693
}
1✔
694

695
// -------------------------------------
696
// ---- unit tests helper function -----
697
// -------------------------------------
698

699
// GetLastAppliedGeneration returns the generation of the last successfully
// applied node state (unit-test helper; see the section header above).
func (dn *NodeReconciler) GetLastAppliedGeneration() int64 {
	return dn.lastAppliedGeneration
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc