• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

NVIDIA / skyhook / 20320280516

17 Dec 2025 11:15PM UTC coverage: 75.452% (+0.5%) from 74.903%
20320280516

push

github

web-flow
feat(cli): add package and node management commands with lifecycle controls (#123)

Add comprehensive CLI commands for managing Skyhook packages and nodes:

Package Commands:
- `package rerun`: Force re-execution of packages on specific nodes
  - Support for stage-specific re-runs (apply, config, interrupt, post-interrupt)
  - Node matching via exact names or regex patterns
- `package status`: Query package status across the cluster
- `package logs`: Retrieve package execution logs with follow/tail support

Node Commands:
- `node list`: List all nodes with Skyhook status
- `node status`: Display detailed status for specific nodes
- `node ignore`: Add/remove ignore label to pause operations on nodes
- `node reset`: Reset node state for a Skyhook

Lifecycle Commands:
- `pause`: Pause Skyhook reconciliation temporarily
- `resume`: Resume paused Skyhook operations
- `disable`: Disable a Skyhook completely
- `enable`: Re-enable a disabled Skyhook

Also includes:
- Comprehensive unit tests with K8s dynamic client mocks
- CLI e2e test suite using chainsaw (lifecycle, node, package tests)
- CI integration for CLI tests in operator-ci workflow
- Shared utilities for node matching, label management, and patch-based updates

1142 of 1535 new or added lines in 16 files covered. (74.4%)

2 existing lines in 1 file now uncovered.

5803 of 7691 relevant lines covered (75.45%)

1.11 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.26
/operator/internal/cli/node/node_ignore.go
1
/*
2
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
 * SPDX-License-Identifier: Apache-2.0
4
 *
5
 *
6
 * Licensed under the Apache License, Version 2.0 (the "License");
7
 * you may not use this file except in compliance with the License.
8
 * You may obtain a copy of the License at
9
 *
10
 * http://www.apache.org/licenses/LICENSE-2.0
11
 *
12
 * Unless required by applicable law or agreed to in writing, software
13
 * distributed under the License is distributed on an "AS IS" BASIS,
14
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
 * See the License for the specific language governing permissions and
16
 * limitations under the License.
17
 */
18

19
package node
20

21
import (
22
        "context"
23
        "fmt"
24

25
        "github.com/spf13/cobra"
26
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27

28
        "github.com/NVIDIA/skyhook/operator/internal/cli/client"
29
        cliContext "github.com/NVIDIA/skyhook/operator/internal/cli/context"
30
        "github.com/NVIDIA/skyhook/operator/internal/cli/utils"
31
)
32

33
// labelValueTrue is the label value this file writes to, and compares against,
// utils.NodeIgnoreLabel when marking a node as ignored.
const labelValueTrue = "true"
34

35
// NewIgnoreCmd creates the node ignore command
36
func NewIgnoreCmd(ctx *cliContext.CLIContext) *cobra.Command {
1✔
37
        cmd := &cobra.Command{
1✔
38
                Use:   "ignore <node-name...>",
1✔
39
                Short: "Ignore node(s) from Skyhook processing",
1✔
40
                Long: `Ignore node(s) from all Skyhook processing by setting the ignore label.
1✔
41

1✔
42
When a node is ignored, Skyhook will skip it during package execution.
1✔
43
This is useful for maintenance windows or debugging.
1✔
44

1✔
45
Node names can be exact matches or regex patterns.`,
1✔
46
                Example: `  # Ignore a single node
1✔
47
  kubectl skyhook node ignore worker-1
1✔
48

1✔
49
  # Ignore multiple nodes
1✔
50
  kubectl skyhook node ignore worker-1 worker-2 worker-3
1✔
51

1✔
52
  # Ignore all nodes matching a pattern
1✔
53
  kubectl skyhook node ignore "worker-.*"
1✔
54

1✔
55
  # Ignore GPU nodes for maintenance
1✔
56
  kubectl skyhook node ignore "gpu-node-[0-9]+"`,
1✔
57
                Args: cobra.MinimumNArgs(1),
1✔
58
                RunE: func(cmd *cobra.Command, args []string) error {
1✔
NEW
59
                        clientFactory := client.NewFactory(ctx.GlobalFlags.ConfigFlags)
×
NEW
60
                        kubeClient, err := clientFactory.Client()
×
NEW
61
                        if err != nil {
×
NEW
62
                                return fmt.Errorf("initializing kubernetes client: %w", err)
×
NEW
63
                        }
×
64

NEW
65
                        return runIgnore(cmd.Context(), cmd, kubeClient, args, ctx, true)
×
66
                },
67
        }
68

69
        return cmd
1✔
70
}
71

72
// NewUnignoreCmd creates the node unignore command
73
func NewUnignoreCmd(ctx *cliContext.CLIContext) *cobra.Command {
1✔
74
        cmd := &cobra.Command{
1✔
75
                Use:   "unignore <node-name...>",
1✔
76
                Short: "Remove ignore label from node(s)",
1✔
77
                Long: `Remove the ignore label from node(s), re-enabling Skyhook processing.
1✔
78

1✔
79
After unignoring, Skyhook will resume package execution on these nodes.
1✔
80

1✔
81
Node names can be exact matches or regex patterns.`,
1✔
82
                Example: `  # Unignore a single node
1✔
83
  kubectl skyhook node unignore worker-1
1✔
84

1✔
85
  # Unignore multiple nodes
1✔
86
  kubectl skyhook node unignore worker-1 worker-2 worker-3
1✔
87

1✔
88
  # Unignore all nodes matching a pattern
1✔
89
  kubectl skyhook node unignore "gpu-node-[0-9]+"`,
1✔
90
                Args: cobra.MinimumNArgs(1),
1✔
91
                RunE: func(cmd *cobra.Command, args []string) error {
1✔
NEW
92
                        clientFactory := client.NewFactory(ctx.GlobalFlags.ConfigFlags)
×
NEW
93
                        kubeClient, err := clientFactory.Client()
×
NEW
94
                        if err != nil {
×
NEW
95
                                return fmt.Errorf("initializing kubernetes client: %w", err)
×
NEW
96
                        }
×
97

NEW
98
                        return runIgnore(cmd.Context(), cmd, kubeClient, args, ctx, false)
×
99
                },
100
        }
101

102
        return cmd
1✔
103
}
104

105
func runIgnore(ctx context.Context, cmd *cobra.Command, kubeClient *client.Client, nodePatterns []string, cliCtx *cliContext.CLIContext, ignore bool) error {
1✔
106
        // Get all nodes
1✔
107
        nodeList, err := kubeClient.Kubernetes().CoreV1().Nodes().List(ctx, metav1.ListOptions{})
1✔
108
        if err != nil {
1✔
NEW
109
                return fmt.Errorf("listing nodes: %w", err)
×
NEW
110
        }
×
111

112
        // Collect all node names for pattern matching
113
        allNodeNames := make([]string, 0, len(nodeList.Items))
1✔
114
        nodeMap := make(map[string]int) // node name -> index in nodeList.Items
1✔
115
        for i, node := range nodeList.Items {
2✔
116
                allNodeNames = append(allNodeNames, node.Name)
1✔
117
                nodeMap[node.Name] = i
1✔
118
        }
1✔
119

120
        // Match nodes
121
        matchedNodes, err := utils.MatchNodes(nodePatterns, allNodeNames)
1✔
122
        if err != nil {
1✔
NEW
123
                return fmt.Errorf("matching nodes: %w", err)
×
NEW
124
        }
×
125

126
        if len(matchedNodes) == 0 {
2✔
127
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "No nodes matched the specified patterns\n")
1✔
128
                return nil
1✔
129
        }
1✔
130

131
        action := "Ignoring"
1✔
132
        if !ignore {
2✔
133
                action = "Unignoring"
1✔
134
        }
1✔
135

136
        _, _ = fmt.Fprintf(cmd.OutOrStdout(), "%s %d node(s):\n", action, len(matchedNodes))
1✔
137
        for _, nodeName := range matchedNodes {
2✔
138
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "  - %s\n", nodeName)
1✔
139
        }
1✔
140

141
        // Dry run check
142
        if cliCtx.GlobalFlags.DryRun {
2✔
143
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "\n[dry-run] No changes applied\n")
1✔
144
                return nil
1✔
145
        }
1✔
146

147
        // Apply changes
148
        var updateErrors []string
1✔
149
        successCount := 0
1✔
150

1✔
151
        for _, nodeName := range matchedNodes {
2✔
152
                idx := nodeMap[nodeName]
1✔
153
                node := &nodeList.Items[idx]
1✔
154

1✔
155
                var err error
1✔
156
                if ignore {
2✔
157
                        // Check if already ignored
1✔
158
                        if val, ok := node.Labels[utils.NodeIgnoreLabel]; ok && val == labelValueTrue {
2✔
159
                                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "  %s: already ignored\n", nodeName)
1✔
160
                                continue
1✔
161
                        }
162
                        err = utils.SetNodeLabel(ctx, kubeClient.Kubernetes(), nodeName, utils.NodeIgnoreLabel, labelValueTrue)
1✔
163
                } else {
1✔
164
                        // Check if not ignored
1✔
165
                        if val, ok := node.Labels[utils.NodeIgnoreLabel]; !ok || val != labelValueTrue {
2✔
166
                                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "  %s: not ignored\n", nodeName)
1✔
167
                                continue
1✔
168
                        }
169
                        err = utils.RemoveNodeLabel(ctx, kubeClient.Kubernetes(), nodeName, utils.NodeIgnoreLabel)
1✔
170
                }
171

172
                if err != nil {
1✔
NEW
173
                        updateErrors = append(updateErrors, fmt.Sprintf("%s: %v", nodeName, err))
×
NEW
174
                        continue
×
175
                }
176
                successCount++
1✔
177
        }
178

179
        // Print results
180
        if len(updateErrors) > 0 {
1✔
NEW
181
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "\nErrors updating some nodes:\n")
×
NEW
182
                for _, e := range updateErrors {
×
NEW
183
                        _, _ = fmt.Fprintf(cmd.OutOrStdout(), "  - %s\n", e)
×
NEW
184
                }
×
185
        }
186

187
        if successCount > 0 {
2✔
188
                verb := "ignored"
1✔
189
                if !ignore {
2✔
190
                        verb = "unignored"
1✔
191
                }
1✔
192
                _, _ = fmt.Fprintf(cmd.OutOrStdout(), "\nSuccessfully %s %d node(s)\n", verb, successCount)
1✔
193
        }
194

195
        return nil
1✔
196
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc