• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

NVIDIA / skyhook / 23175909332

17 Mar 2026 02:41AM UTC coverage: 80.864%. First build
23175909332

Pull #183

github

web-flow
Merge deabede31 into ed9eb698f
Pull Request #183: fix: batch stickiness — nodes in NodePriority finish all packages before new nodes are picked

50 of 57 new or added lines in 5 files covered. (87.72%)

6909 of 8544 relevant lines covered (80.86%)

3.78 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.93
/operator/cmd/cli/app/reset.go
1
/*
2
 * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
 * SPDX-License-Identifier: Apache-2.0
4
 *
5
 *
6
 * Licensed under the Apache License, Version 2.0 (the "License");
7
 * you may not use this file except in compliance with the License.
8
 * You may obtain a copy of the License at
9
 *
10
 * http://www.apache.org/licenses/LICENSE-2.0
11
 *
12
 * Unless required by applicable law or agreed to in writing, software
13
 * distributed under the License is distributed on an "AS IS" BASIS,
14
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
 * See the License for the specific language governing permissions and
16
 * limitations under the License.
17
 */
18

19
package app
20

21
import (
22
        "bufio"
23
        "context"
24
        "encoding/json"
25
        "fmt"
26
        "strings"
27

28
        "github.com/spf13/cobra"
29
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30

31
        "github.com/NVIDIA/skyhook/operator/api/v1alpha1"
32
        "github.com/NVIDIA/skyhook/operator/internal/cli/client"
33
        cliContext "github.com/NVIDIA/skyhook/operator/internal/cli/context"
34
        "github.com/NVIDIA/skyhook/operator/internal/cli/utils"
35
)
36

37
const (
38
        nodeStateAnnotationPrefix = v1alpha1.METADATA_PREFIX + "/nodeState_"
39
        statusAnnotationPrefix    = v1alpha1.METADATA_PREFIX + "/status_"
40
        cordonAnnotationPrefix    = v1alpha1.METADATA_PREFIX + "/cordon_"
41
        versionAnnotationPrefix   = v1alpha1.METADATA_PREFIX + "/version_"
42
        autoTaintAnnotationPrefix = v1alpha1.METADATA_PREFIX + "/autoTaint_"
43
        statusLabelPrefix         = v1alpha1.METADATA_PREFIX + "/status_"
44
)
45

46
// resetOptions holds the options for the reset command.
type resetOptions struct {
	// confirm is bound to the --confirm / -y flag; when true the interactive
	// confirmation prompt is skipped.
	confirm bool
	// skipBatchReset is bound to the --skip-batch-reset flag; when true the
	// deployment policy batch state is left untouched.
	skipBatchReset bool
}
51

52
// NewResetCmd creates the reset command
53
func NewResetCmd(ctx *cliContext.CLIContext) *cobra.Command {
2✔
54
        opts := &resetOptions{}
2✔
55

2✔
56
        cmd := &cobra.Command{
2✔
57
                Use:   "reset <skyhook-name>",
2✔
58
                Short: "Reset all nodes for a Skyhook",
2✔
59
                Long: `Reset all package state on all nodes for a specific Skyhook, forcing a complete re-run.
2✔
60

2✔
61
This command removes all Skyhook state from all nodes that have state for the
2✔
62
specified Skyhook, causing the operator to re-execute all packages from the beginning.
2✔
63

2✔
64
Unlike 'node reset' which resets specific nodes, 'skyhook reset' resets ALL nodes
2✔
65
that have state for the specified Skyhook.
2✔
66

2✔
67
By default, this command also resets the deployment policy batch state, allowing
2✔
68
the rollout to start fresh from batch 1. Use --skip-batch-reset to preserve the
2✔
69
existing batch state.`,
2✔
70
                Example: `  # Reset all nodes for gpu-init Skyhook
2✔
71
  kubectl skyhook reset gpu-init --confirm
2✔
72

2✔
73
  # Preview changes without applying (dry-run)
2✔
74
  kubectl skyhook reset gpu-init --dry-run
2✔
75

2✔
76
  # Reset nodes only, preserve batch state
2✔
77
  kubectl skyhook reset gpu-init --skip-batch-reset --confirm`,
2✔
78
                Args: cobra.ExactArgs(1),
2✔
79
                RunE: func(cmd *cobra.Command, args []string) error {
3✔
80
                        skyhookName := args[0]
1✔
81

1✔
82
                        clientFactory := client.NewFactory(ctx.GlobalFlags.ConfigFlags)
1✔
83
                        kubeClient, err := clientFactory.Client()
1✔
84
                        if err != nil {
1✔
85
                                return fmt.Errorf("initializing kubernetes client: %w", err)
×
86
                        }
×
87

88
                        return runReset(cmd.Context(), cmd, kubeClient, skyhookName, opts, ctx)
1✔
89
                },
90
        }
91

92
        cmd.Flags().BoolVarP(&opts.confirm, "confirm", "y", false, "Skip confirmation prompt")
2✔
93
        cmd.Flags().BoolVar(&opts.skipBatchReset, "skip-batch-reset", false, "Skip resetting deployment policy batch state")
2✔
94

2✔
95
        return cmd
2✔
96
}
97

98
func runReset(ctx context.Context, cmd *cobra.Command, kubeClient *client.Client, skyhookName string, opts *resetOptions, cliCtx *cliContext.CLIContext) error {
2✔
99
        // Get all nodes
2✔
100
        nodeList, err := kubeClient.Kubernetes().CoreV1().Nodes().List(ctx, metav1.ListOptions{})
2✔
101
        if err != nil {
2✔
102
                return fmt.Errorf("listing nodes: %w", err)
×
103
        }
×
104

105
        // Find nodes that have the specified Skyhook annotation
106
        annotationKey := nodeStateAnnotationPrefix + skyhookName
2✔
107
        nodesToReset := make([]string, 0)
2✔
108
        nodeStates := make(map[string]v1alpha1.NodeState)
2✔
109

2✔
110
        for _, node := range nodeList.Items {
4✔
111
                annotation, ok := node.Annotations[annotationKey]
2✔
112
                if !ok {
4✔
113
                        continue
2✔
114
                }
115

116
                var nodeState v1alpha1.NodeState
2✔
117
                if err := json.Unmarshal([]byte(annotation), &nodeState); err != nil {
3✔
118
                        if cliCtx.GlobalFlags.Verbose {
1✔
119
                                _, _ = fmt.Fprintf(cmd.ErrOrStderr(), "Warning: skipping node %q - invalid annotation: %v\n", node.Name, err)
×
120
                        }
×
121
                        continue
1✔
122
                }
123

124
                nodesToReset = append(nodesToReset, node.Name)
2✔
125
                nodeStates[node.Name] = nodeState
2✔
126
        }
127

128
        if len(nodesToReset) == 0 {
4✔
129
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "No nodes have state for Skyhook %q\n", skyhookName)
2✔
130
                return nil
2✔
131
        }
2✔
132

133
        // Print summary
134
        _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Skyhook: %s\n", skyhookName)
2✔
135
        _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Nodes to reset (%d):\n", len(nodesToReset))
2✔
136
        for _, nodeName := range nodesToReset {
4✔
137
                nodeState := nodeStates[nodeName]
2✔
138
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "  - %s (%d packages)\n", nodeName, len(nodeState))
2✔
139
        }
2✔
140

141
        // Show batch reset info
142
        if !opts.skipBatchReset {
4✔
143
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "\nBatch state will also be reset (use --skip-batch-reset to preserve)\n")
2✔
144
        }
2✔
145

146
        // Dry run check
147
        if cliCtx.GlobalFlags.DryRun {
3✔
148
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "\n[dry-run] No changes applied\n")
1✔
149
                return nil
1✔
150
        }
1✔
151

152
        // Confirmation
153
        if !opts.confirm {
3✔
154
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "\nThis will remove ALL package state for Skyhook %q on %d node(s).\n", skyhookName, len(nodesToReset))
1✔
155
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "All packages will re-run from the beginning.\n")
1✔
156
                if !opts.skipBatchReset {
2✔
157
                        _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Batch state will be reset to start from batch 1.\n")
1✔
158
                }
1✔
159
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Continue? [y/N]: ")
1✔
160

1✔
161
                reader := bufio.NewReader(cmd.InOrStdin())
1✔
162
                response, err := reader.ReadString('\n')
1✔
163
                if err != nil {
1✔
164
                        return fmt.Errorf("reading confirmation: %w", err)
×
165
                }
×
166

167
                response = strings.ToLower(strings.TrimSpace(response))
1✔
168
                if response != "y" && response != "yes" {
2✔
169
                        _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Aborted\n")
1✔
170
                        return nil
1✔
171
                }
1✔
172
        }
173

174
        // Apply changes - clear all skyhook-related annotations and labels
175
        successCount, updateErrors := resetNodeAnnotations(ctx, cmd, kubeClient, nodesToReset, skyhookName, cliCtx)
2✔
176

2✔
177
        // Print results
2✔
178
        if len(updateErrors) > 0 {
2✔
179
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "\nErrors resetting some nodes:\n")
×
180
                for _, e := range updateErrors {
×
181
                        _, _ = fmt.Fprintf(cmd.OutOrStdout(), "  - %s\n", e)
×
182
                }
×
183
        }
184

185
        if successCount > 0 {
4✔
186
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "\nSuccessfully reset %d node(s) for Skyhook %q\n", successCount, skyhookName)
2✔
187
        }
2✔
188

189
        // Reset batch state unless --skip-batch-reset is set
190
        if !opts.skipBatchReset {
4✔
191
                resetBatchStateForReset(ctx, cmd, kubeClient, skyhookName)
2✔
192
        }
2✔
193

194
        return nil
2✔
195
}
196

197
// resetNodeAnnotations removes all skyhook-related annotations and labels from nodes
198
func resetNodeAnnotations(ctx context.Context, cmd *cobra.Command, kubeClient *client.Client, nodesToReset []string, skyhookName string, cliCtx *cliContext.CLIContext) (int, []string) {
2✔
199
        var updateErrors []string
2✔
200
        successCount := 0
2✔
201

2✔
202
        for _, nodeName := range nodesToReset {
4✔
203
                annotationsToRemove := []string{
2✔
204
                        nodeStateAnnotationPrefix + skyhookName,
2✔
205
                        statusAnnotationPrefix + skyhookName,
2✔
206
                        cordonAnnotationPrefix + skyhookName,
2✔
207
                        versionAnnotationPrefix + skyhookName,
2✔
208
                        autoTaintAnnotationPrefix + skyhookName,
2✔
209
                }
2✔
210
                labelsToRemove := []string{
2✔
211
                        statusLabelPrefix + skyhookName,
2✔
212
                }
2✔
213

2✔
214
                // Try to remove the main nodeState annotation first - this is the critical one
2✔
215
                mainAnnotationKey := nodeStateAnnotationPrefix + skyhookName
2✔
216
                if err := utils.RemoveNodeAnnotation(ctx, kubeClient.Kubernetes(), nodeName, mainAnnotationKey); err != nil {
2✔
217
                        updateErrors = append(updateErrors, fmt.Sprintf("%s: failed to remove nodeState annotation: %v", nodeName, err))
×
218
                        continue
×
219
                }
220

221
                // Remove other annotations (non-critical, so we don't fail if they don't exist)
222
                for _, annKey := range annotationsToRemove {
4✔
223
                        if annKey == mainAnnotationKey {
4✔
224
                                continue // Already removed
2✔
225
                        }
226
                        if err := utils.RemoveNodeAnnotation(ctx, kubeClient.Kubernetes(), nodeName, annKey); err != nil {
2✔
227
                                if cliCtx.GlobalFlags.Verbose {
×
228
                                        _, _ = fmt.Fprintf(cmd.ErrOrStderr(), "Warning: failed to remove annotation %q from node %q: %v\n", annKey, nodeName, err)
×
229
                                }
×
230
                        }
231
                }
232

233
                // Remove labels (non-critical, so we don't fail if they don't exist)
234
                for _, labelKey := range labelsToRemove {
4✔
235
                        if err := utils.RemoveNodeLabel(ctx, kubeClient.Kubernetes(), nodeName, labelKey); err != nil {
2✔
236
                                if cliCtx.GlobalFlags.Verbose {
×
237
                                        _, _ = fmt.Fprintf(cmd.ErrOrStderr(), "Warning: failed to remove label %q from node %q: %v\n", labelKey, nodeName, err)
×
238
                                }
×
239
                        }
240
                }
241

242
                successCount++
2✔
243
        }
244

245
        return successCount, updateErrors
2✔
246
}
247

248
// resetBatchStateForReset resets the batch state for a Skyhook if dynamic client is available
249
func resetBatchStateForReset(ctx context.Context, cmd *cobra.Command, kubeClient *client.Client, skyhookName string) {
2✔
250
        if kubeClient.Dynamic() == nil {
3✔
251
                return
1✔
252
        }
1✔
253

254
        skyhook, err := utils.GetSkyhook(ctx, kubeClient.Dynamic(), skyhookName)
1✔
255
        if err != nil {
1✔
256
                _, _ = fmt.Fprintf(cmd.ErrOrStderr(), "Warning: failed to get skyhook for batch reset: %v\n", err)
×
257
                return
×
258
        }
×
259

260
        if len(skyhook.Status.CompartmentStatuses) == 0 {
1✔
NEW
261
                return
×
NEW
262
        }
×
263

264
        skyhook.ResetCompartmentBatchStates()
1✔
265
        if err := utils.PatchSkyhookStatus(ctx, kubeClient.Dynamic(), skyhookName, skyhook.Status); err != nil {
1✔
NEW
266
                _, _ = fmt.Fprintf(cmd.ErrOrStderr(), "Warning: failed to reset batch state: %v\n", err)
×
NEW
267
                return
×
NEW
268
        }
×
269
        _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Batch state reset for Skyhook %q\n", skyhookName)
1✔
270
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc