• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Financial-Times / upp-aggregate-healthcheck / 1199

05 Sep 2025 07:33AM UTC coverage: 64.679% (-0.4%) from 65.118%
1199

push

circleci

web-flow
Merge pull request #120 from Financial-Times/fix/UPPSF-6412-implement-retry-logic-in-upp-aggregate-healthcheck

Added attempts to getHealthChecksForPod func

21 of 55 new or added lines in 5 files covered. (38.18%)

3 existing lines in 3 files now uncovered.

976 of 1509 relevant lines covered (64.68%)

2.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.87
/controller.go
1
package main
2

3
import (
4
        "context"
5
        "fmt"
6
        "sort"
7
        "sync"
8
        "time"
9

10
        fthealth "github.com/Financial-Times/go-fthealth/v1_1"
11
        log "github.com/Financial-Times/go-logger"
12
)
13

14
type healthCheckController struct {
15
        healthCheckService             healthcheckService
16
        environment                    string
17
        measuredServices               map[string]measuredService
18
        stickyCategoriesFailedServices map[string]int
19
}
20

21
type controller interface {
22
        buildServicesHealthResult(context.Context, []string, bool) (fthealth.HealthResult, map[string]category, error)
23
        runServiceChecksByServiceNames(context.Context, map[string]service, map[string]category) ([]fthealth.CheckResult, error)
24
        runServiceChecksFor(context.Context, map[string]category) ([]fthealth.CheckResult, error)
25
        buildPodsHealthResult(context.Context, string) (fthealth.HealthResult, error)
26
        runPodChecksFor(context.Context, string) ([]fthealth.CheckResult, error)
27
        collectChecksFromCachesFor(context.Context, map[string]category) ([]fthealth.CheckResult, error)
28
        updateCachedHealth(context.Context, map[string]service, map[string]category)
29
        scheduleCheck(measuredService, time.Duration, *time.Timer)
30
        getIndividualPodHealth(context.Context, string) ([]byte, string, error)
31
        addAck(context.Context, string, string) error
32
        updateStickyCategory(context.Context, string, bool) error
33
        removeAck(context.Context, string) error
34
        getEnvironment() string
35
        getSeverityForService(context.Context, string, int32) uint8
36
        getSeverityForPod(context.Context, string, int32) uint8
37
        getMeasuredServices() map[string]measuredService
38
}
39

NEW
40
func initializeController(environment string, maxCheckAttempts int, checkCooldown time.Duration) *healthCheckController {
×
NEW
41
        service := initializeHealthCheckService(maxCheckAttempts, checkCooldown)
×
42
        measuredServices := make(map[string]measuredService)
×
43
        stickyCategoriesFailedServices := make(map[string]int)
×
44

×
45
        return &healthCheckController{
×
46
                healthCheckService:             service,
×
47
                environment:                    environment,
×
48
                measuredServices:               measuredServices,
×
49
                stickyCategoriesFailedServices: stickyCategoriesFailedServices,
×
50
        }
×
51
}
×
52

53
func (c *healthCheckController) getEnvironment() string {
1✔
54
        return c.environment
1✔
55
}
1✔
56

57
func (c *healthCheckController) updateStickyCategory(ctx context.Context, categoryName string, isEnabled bool) error {
2✔
58
        return c.healthCheckService.updateCategory(ctx, categoryName, isEnabled)
2✔
59
}
2✔
60

61
func (c *healthCheckController) removeAck(ctx context.Context, serviceName string) error {
3✔
62
        if !c.healthCheckService.isServicePresent(serviceName) {
4✔
63
                return fmt.Errorf("cannot find service with name %s", serviceName)
1✔
64
        }
1✔
65

66
        err := c.healthCheckService.removeAck(ctx, serviceName)
2✔
67

2✔
68
        if err != nil {
3✔
69
                return fmt.Errorf("failed to remove ack for service %s: %s", serviceName, err.Error())
1✔
70
        }
1✔
71

72
        return nil
1✔
73
}
74

75
func (c *healthCheckController) addAck(ctx context.Context, serviceName, ackMessage string) error {
3✔
76
        if !c.healthCheckService.isServicePresent(serviceName) {
4✔
77
                return fmt.Errorf("cannot find service with name %s", serviceName)
1✔
78
        }
1✔
79

80
        err := c.healthCheckService.addAck(ctx, serviceName, ackMessage)
2✔
81

2✔
82
        if err != nil {
3✔
83
                return fmt.Errorf("failed to add ack message [%s] for service %s: %s", ackMessage, serviceName, err.Error())
1✔
84
        }
1✔
85

86
        return nil
1✔
87
}
88

89
func (c *healthCheckController) buildServicesHealthResult(ctx context.Context, providedCategories []string, useCache bool) (fthealth.HealthResult, map[string]category, error) {
5✔
90
        var checkResults []fthealth.CheckResult
5✔
91
        desc := "Health of the whole cluster of the moment served without cache."
5✔
92
        availableCategories, err := c.healthCheckService.getCategories(ctx)
5✔
93
        if err != nil {
5✔
94
                return fthealth.HealthResult{}, nil, fmt.Errorf("cannot build health check result for services: %v", err.Error())
×
95
        }
×
96

97
        matchingCategories := getMatchingCategories(providedCategories, availableCategories)
5✔
98

5✔
99
        if useCache {
7✔
100
                desc = "Health of the whole cluster served from cache."
2✔
101
                checkResults, err = c.collectChecksFromCachesFor(ctx, matchingCategories)
2✔
102
        } else {
5✔
103
                checkResults, err = c.runServiceChecksFor(ctx, matchingCategories)
3✔
104
        }
3✔
105
        if err != nil {
6✔
106
                return fthealth.HealthResult{}, nil, fmt.Errorf("cannot build health check result for services: %v", err.Error())
1✔
107
        }
1✔
108

109
        c.disableStickyFailingCategories(ctx, matchingCategories, checkResults)
4✔
110

4✔
111
        finalOk, finalSeverity := getFinalResult(checkResults, matchingCategories)
4✔
112

4✔
113
        health := fthealth.HealthResult{
4✔
114
                SystemCode:    c.environment,
4✔
115
                Checks:        checkResults,
4✔
116
                Description:   desc,
4✔
117
                Name:          c.environment + " cluster health",
4✔
118
                SchemaVersion: 1,
4✔
119
                Ok:            finalOk,
4✔
120
                Severity:      finalSeverity,
4✔
121
        }
4✔
122

4✔
123
        sort.Sort(byNameComparator(health.Checks))
4✔
124

4✔
125
        return health, matchingCategories, nil
4✔
126
}
127

128
func (c *healthCheckController) runServiceChecksByServiceNames(ctx context.Context, services map[string]service, categories map[string]category) ([]fthealth.CheckResult, error) {
5✔
129
        deployments, err := c.healthCheckService.getDeployments(ctx)
5✔
130
        if err != nil {
6✔
131
                return nil, err
1✔
132
        }
1✔
133

134
        checks := make([]fthealth.Check, 0, len(services))
4✔
135
        for _, service := range services {
12✔
136
                check := newServiceHealthCheck(ctx, service, deployments, c.healthCheckService)
8✔
137
                checks = append(checks, check)
8✔
138
        }
8✔
139

140
        healthChecks := fthealth.RunCheck(fthealth.HealthCheck{
4✔
141
                SystemCode:  "aggregate-healthcheck",
4✔
142
                Name:        "Aggregate Healthcheck",
4✔
143
                Description: "Forced check run",
4✔
144
                Checks:      checks,
4✔
145
        }).Checks
4✔
146

4✔
147
        wg := sync.WaitGroup{}
4✔
148
        tempCtx, cancel := context.WithCancel(context.Background())
4✔
149
        defer cancel()
4✔
150
        for i := range healthChecks {
12✔
151
                wg.Add(1)
8✔
152
                go func(context context.Context, i int) {
16✔
153
                        healthCheck := healthChecks[i]
8✔
154
                        if !healthCheck.Ok {
16✔
155
                                if unhealthyService, ok := services[healthCheck.Name]; ok {
16✔
156
                                        severity := c.getSeverityForService(context, healthCheck.Name, unhealthyService.appPort)
8✔
157
                                        healthChecks[i].Severity = severity
8✔
158
                                } else {
8✔
159
                                        log.Warnf("Cannot compute severity for service with name %s because it was not found. Using default value.", healthCheck.Name)
×
160
                                }
×
161
                        }
162
                        wg.Done()
8✔
163
                }(tempCtx, i)
164
        }
165
        wg.Wait()
4✔
166

4✔
167
        for _, service := range services {
12✔
168
                if service.ack != "" {
12✔
169
                        updateHealthCheckWithAckMsg(healthChecks, service.name, service.ack)
4✔
170
                }
4✔
171
        }
172

173
        c.updateCachedHealth(tempCtx, services, categories)
4✔
174
        return healthChecks, nil
4✔
175
}
176

177
func (c *healthCheckController) runServiceChecksFor(ctx context.Context, categories map[string]category) (healthChecks []fthealth.CheckResult, err error) {
3✔
178
        serviceNames := getServiceNamesFromCategories(categories)
3✔
179
        services := c.healthCheckService.getServicesMapByNames(serviceNames)
3✔
180
        healthChecks, err = c.runServiceChecksByServiceNames(ctx, services, categories)
3✔
181
        if err != nil {
4✔
182
                return nil, err
1✔
183
        }
1✔
184

185
        return healthChecks, err
2✔
186
}
187

188
//nolint:gocognit
189
func (c *healthCheckController) disableStickyFailingCategories(ctx context.Context, categories map[string]category, healthChecks []fthealth.CheckResult) {
8✔
190
        for catIndex, category := range categories {
16✔
191
                if !isEnabledAndSticky(category) {
13✔
192
                        continue
5✔
193
                }
194

195
                for _, serviceName := range category.services {
6✔
196
                        for _, healthCheck := range healthChecks {
12✔
197
                                if healthCheck.Name == serviceName && !healthCheck.Ok {
11✔
198
                                        c.stickyCategoriesFailedServices[serviceName]++
2✔
199
                                        log.Infof("Sticky category [%s] is unhealthy -- check %v/%v.", category.name, c.stickyCategoriesFailedServices[serviceName], category.failureThreshold)
2✔
200

2✔
201
                                        if c.isCategoryThresholdExceeded(serviceName, category.failureThreshold) {
3✔
202
                                                log.Infof("Sticky category [%s] is unhealthy, disabling it.", category.name)
1✔
203
                                                category.isEnabled = false
1✔
204
                                                categories[catIndex] = category
1✔
205

1✔
206
                                                err := c.healthCheckService.updateCategory(ctx, category.name, false)
1✔
207
                                                if err != nil {
1✔
208
                                                        log.WithError(err).Errorf("Cannot disable sticky category with name %s.", category.name)
×
209
                                                } else {
1✔
210
                                                        log.Infof("Category [%s] disabled", category.name)
1✔
211
                                                        c.stickyCategoriesFailedServices[serviceName] = 0
1✔
212
                                                }
1✔
213
                                        }
214
                                }
215
                        }
216
                }
217
        }
218
}
219

220
func (c *healthCheckController) isCategoryThresholdExceeded(serviceName string, failureThreshold int) bool {
2✔
221
        return c.stickyCategoriesFailedServices[serviceName] >= failureThreshold
2✔
222
}
2✔
223

224
func isEnabledAndSticky(category category) bool {
8✔
225
        return category.isSticky && category.isEnabled
8✔
226
}
8✔
227

228
func updateHealthCheckWithAckMsg(healthChecks []fthealth.CheckResult, name string, ackMsg string) {
4✔
229
        for i, healthCheck := range healthChecks {
8✔
230
                if healthCheck.Name == name {
8✔
231
                        healthChecks[i].Ack = ackMsg
4✔
232
                        return
4✔
233
                }
4✔
234
        }
235
}
236

237
func getFinalResult(checkResults []fthealth.CheckResult, categories map[string]category) (bool, uint8) {
7✔
238
        finalOk := true
7✔
239
        finalSeverity := defaultSeverity
7✔
240

7✔
241
        if len(checkResults) == 0 {
8✔
242
                return false, finalSeverity
1✔
243
        }
1✔
244

245
        for _, category := range categories {
8✔
246
                if !category.isEnabled {
4✔
247
                        finalOk = false
2✔
248
                }
2✔
249
        }
250

251
        for _, checkResult := range checkResults {
17✔
252
                if !checkResult.Ok && checkResult.Ack == "" {
16✔
253
                        finalOk = false
5✔
254

5✔
255
                        if checkResult.Severity < finalSeverity {
6✔
256
                                finalSeverity = checkResult.Severity
1✔
257
                        }
1✔
258
                }
259
        }
260

261
        return finalOk, finalSeverity
6✔
262
}
263

264
func getMatchingCategories(providedCategories []string, availableCategories map[string]category) map[string]category {
6✔
265
        result := make(map[string]category)
6✔
266
        for _, providedCat := range providedCategories {
15✔
267
                if _, ok := availableCategories[providedCat]; ok {
10✔
268
                        result[providedCat] = availableCategories[providedCat]
1✔
269
                }
1✔
270
        }
271

272
        return result
6✔
273
}
274

275
func getServiceNamesFromCategories(categories map[string]category) []string {
7✔
276
        var services []string
7✔
277

7✔
278
        if _, ok := categories["default"]; ok {
8✔
279
                return services
1✔
280
        }
1✔
281

282
        for categoryName := range categories {
8✔
283
                servicesForCategory := categories[categoryName].services
2✔
284
                for _, service := range servicesForCategory {
6✔
285
                        if !isStringInSlice(service, services) {
7✔
286
                                services = append(services, service)
3✔
287
                        }
3✔
288
                }
289
        }
290

291
        return services
6✔
292
}
293

294
// isStringInSlice reports whether a occurs in list.
func isStringInSlice(a string, list []string) bool {
	for i := range list {
		if list[i] == a {
			return true
		}
	}

	return false
}
303

304
// used for sorting checks
305
type byNameComparator []fthealth.CheckResult
306

307
func (s byNameComparator) Less(i, j int) bool {
5✔
308
        return s[i].Name < s[j].Name
5✔
309
}
5✔
310

311
func (s byNameComparator) Len() int {
5✔
312
        return len(s)
5✔
313
}
5✔
314
func (s byNameComparator) Swap(i, j int) {
1✔
315
        s[i], s[j] = s[j], s[i]
1✔
316
}
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc