• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

SpiNNakerManchester / JavaSpiNNaker / 6310285782

26 Sep 2023 08:47AM UTC coverage: 36.367% (-0.5%) from 36.866%
6310285782

Pull #658

github

dkfellows
Merge branch 'master' into java-17
Pull Request #658: Update Java version to 17 and JEE to 9

1675 of 1675 new or added lines in 266 files covered. (100.0%)

8368 of 23010 relevant lines covered (36.37%)

0.36 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

62.47
/SpiNNaker-allocserv/src/main/java/uk/ac/manchester/spinnaker/alloc/bmp/BMPController.java
1
/*
2
 * Copyright (c) 2021 The University of Manchester
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *     https://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
package uk.ac.manchester.spinnaker.alloc.bmp;
17

18
import static java.lang.String.format;
19
import static java.lang.Thread.currentThread;
20
import static java.lang.Thread.sleep;
21
import static java.time.Instant.now;
22
import static java.util.Objects.requireNonNull;
23
import static org.slf4j.LoggerFactory.getLogger;
24
import static uk.ac.manchester.spinnaker.alloc.bmp.NonBootOperation.GET_SERIAL;
25
import static uk.ac.manchester.spinnaker.alloc.bmp.NonBootOperation.READ_BL;
26
import static uk.ac.manchester.spinnaker.alloc.bmp.NonBootOperation.READ_TEMP;
27
import static uk.ac.manchester.spinnaker.alloc.bmp.NonBootOperation.WRITE_BL;
28
import static uk.ac.manchester.spinnaker.alloc.model.JobState.DESTROYED;
29
import static uk.ac.manchester.spinnaker.alloc.model.JobState.QUEUED;
30
import static uk.ac.manchester.spinnaker.alloc.model.JobState.READY;
31

32
import java.io.IOException;
33
import java.lang.Thread.UncaughtExceptionHandler;
34
import java.util.ArrayList;
35
import java.util.Collection;
36
import java.util.HashMap;
37
import java.util.LinkedList;
38
import java.util.List;
39
import java.util.Map;
40
import java.util.Optional;
41
import java.util.function.Consumer;
42
import java.util.stream.Collectors;
43

44
import org.slf4j.Logger;
45
import org.springframework.beans.factory.ObjectProvider;
46
import org.springframework.beans.factory.annotation.Autowired;
47
import org.springframework.jmx.export.annotation.ManagedResource;
48
import org.springframework.scheduling.TaskScheduler;
49
import org.springframework.scheduling.concurrent.ThreadPoolTaskScheduler;
50
import org.springframework.stereotype.Service;
51

52
import com.google.errorprone.annotations.RestrictedApi;
53
import com.google.errorprone.annotations.concurrent.GuardedBy;
54

55
import jakarta.annotation.PostConstruct;
56
import uk.ac.manchester.spinnaker.alloc.ForTestingOnly;
57
import uk.ac.manchester.spinnaker.alloc.ServiceMasterControl;
58
import uk.ac.manchester.spinnaker.alloc.SpallocProperties.AllocatorProperties;
59
import uk.ac.manchester.spinnaker.alloc.SpallocProperties.TxrxProperties;
60
import uk.ac.manchester.spinnaker.alloc.admin.ReportMailSender;
61
import uk.ac.manchester.spinnaker.alloc.allocator.AllocatorTask;
62
import uk.ac.manchester.spinnaker.alloc.allocator.Epochs;
63
import uk.ac.manchester.spinnaker.alloc.allocator.SpallocAPI;
64
import uk.ac.manchester.spinnaker.alloc.db.DatabaseAPI.Connection;
65
import uk.ac.manchester.spinnaker.alloc.db.DatabaseAwareBean;
66
import uk.ac.manchester.spinnaker.alloc.db.Row;
67
import uk.ac.manchester.spinnaker.alloc.model.Direction;
68
import uk.ac.manchester.spinnaker.alloc.model.JobState;
69
import uk.ac.manchester.spinnaker.machine.board.BMPBoard;
70
import uk.ac.manchester.spinnaker.machine.board.BMPCoords;
71
import uk.ac.manchester.spinnaker.machine.board.HasBMPLocation;
72
import uk.ac.manchester.spinnaker.messages.model.ADCInfo;
73
import uk.ac.manchester.spinnaker.messages.model.Blacklist;
74
import uk.ac.manchester.spinnaker.transceiver.ProcessException;
75
import uk.ac.manchester.spinnaker.transceiver.ProcessException.CallerProcessException;
76
import uk.ac.manchester.spinnaker.transceiver.ProcessException.PermanentProcessException;
77
import uk.ac.manchester.spinnaker.transceiver.ProcessException.TransientProcessException;
78
import uk.ac.manchester.spinnaker.transceiver.SpinnmanException;
79
import uk.ac.manchester.spinnaker.utils.UsedInJavadocOnly;
80

81
/**
82
 * Manages the BMPs of machines controlled by Spalloc.
83
 *
84
 * @author Donal Fellows
85
 */
86
@Service("bmpController")
87
@ManagedResource("Spalloc:type=BMPController,name=bmpController")
88
public class BMPController extends DatabaseAwareBean {
1✔
89
        private static final Logger log = getLogger(BMPController.class);
1✔
90

91
        @Autowired
92
        private SpallocAPI spallocCore;
93

94
        @Autowired
95
        private ServiceMasterControl serviceControl;
96

97
        @Autowired
98
        private Epochs epochs;
99

100
        @Autowired
101
        private TxrxProperties props;
102

103
        @Autowired
104
        private PhysicalSerialMapping phySerMap;
105

106
        @Autowired
107
        private AllocatorProperties allocProps;
108

109
        @Autowired
110
        private ReportMailSender emailSender;
111

112
        @Autowired
113
        private AllocatorTask allocator;
114

115
        private TaskScheduler scheduler;
116

117
        /**
118
         * Map from BMP ID to worker task that handles it.
119
         */
120
        private final Map<Integer, Worker> workers = new HashMap<>();
1✔
121

122
        /**
123
         * Factory for {@linkplain SpiNNakerControl controllers}. Only use via
124
         * {@link #controllerFactory}.
125
         */
126
        @Autowired
127
        private ObjectProvider<SpiNNakerControl> controllerFactoryBean;
128

129
        /**
130
         * Type-safe factory for {@linkplain SpiNNakerControl controllers}.
131
         */
132
        private SpiNNakerControl.Factory controllerFactory;
133

134
        @GuardedBy("this")
135
        private Throwable bmpProcessingException;
136

137
        /**
138
         * An {@link UncaughtExceptionHandler}.
139
         *
140
         * @param thread
141
         *            The thread with the problem.
142
         * @param exception
143
         *            The exception that describes the problem.
144
         */
145
        @UsedInJavadocOnly(UncaughtExceptionHandler.class)
146
        private void handleException(Thread thread, Throwable exception) {
147
                log.error("uncaught exception in BMP worker {}", thread, exception);
×
148
        }
×
149

150
        // ----------------------------------------------------------------
151

152
        @PostConstruct
153
        private void init() {
154
                // Set up scheduler
155
                var sched = new ThreadPoolTaskScheduler();
1✔
156
                scheduler = sched;
1✔
157
                sched.setThreadGroupName("BMP");
1✔
158

159
                controllerFactory = controllerFactoryBean::getObject;
1✔
160
                allocator.setBMPController(this);
1✔
161

162
                // We do the making of workers later in tests
163
                List<Worker> madeWorkers = null;
1✔
164
                if (!serviceControl.isUseDummyBMP()) {
1✔
165
                        madeWorkers = makeWorkers();
×
166
                }
167

168
                // Set the pool size to match the number of workers
169
                if (workers.size() > 1) {
1✔
170
                        sched.setPoolSize(workers.size());
×
171
                }
172

173
                // Launch the scheduler now it is all set up
174
                sched.initialize();
1✔
175

176
                // And now use the scheduler
177
                if (madeWorkers != null) {
1✔
178
                        for (var worker : madeWorkers) {
×
179
                                scheduler.scheduleAtFixedRate(worker, allocProps.getPeriod());
×
180
                        }
×
181
                }
182
        }
1✔
183

184
        private List<Worker> makeWorkers() {
185
                // Make workers
186
                try (var c = getConnection();
1✔
187
                                var getBmps = c.query(GET_ALL_BMPS);
1✔
188
                                var getBoards = c.query(GET_ALL_BMP_BOARDS)) {
1✔
189
                        return c.transaction(false, () -> getBmps.call(row -> {
1✔
190
                                var m = spallocCore.getMachine(row.getString("machine_name"),
1✔
191
                                                true);
192
                                var coords = new BMPCoords(row.getInt("cabinet"),
1✔
193
                                                row.getInt("frame"));
1✔
194
                                var boards = new HashMap<BMPBoard, String>();
1✔
195
                                var bmpId = row.getInt("bmp_id");
1✔
196
                                getBoards.call(r -> {
1✔
197
                                        boards.put(new BMPBoard(r.getInt("board_num")),
1✔
198
                                                        r.getString("address"));
1✔
199
                                        return null;
1✔
200
                                }, bmpId);
1✔
201
                                var control = controllerFactory.create(m.get(), coords, boards);
1✔
202
                                var worker = new Worker(control, bmpId);
1✔
203
                                workers.put(row.getInt("bmp_id"), worker);
1✔
204
                                return worker;
1✔
205
                        }));
206
                }
207
        }
208

209
        /**
210
         * Trigger the execution of the workers for the given BMPs now.
211
         *
212
         * @param bmps
213
         *            A list of BMPs that have changed.
214
         */
215
        public void triggerSearch(Collection<Integer> bmps) {
216
                for (var b : bmps) {
1✔
217
                        var worker = workers.get(b);
1✔
218
                        if (worker != null) {
1✔
219
                                scheduler.schedule(worker::run, now());
1✔
220
                        } else {
221
                                log.error("Could not find worker for BMP {}", b);
1✔
222
                        }
223
                }
1✔
224
        }
1✔
225

226
        /** An action that may throw any of a range of exceptions. */
227
        private interface ThrowingAction {
228
                void act() throws ProcessException, IOException, InterruptedException;
229
        }
230

231
        private abstract sealed class Request
232
                        permits BoardRequest, PowerRequest {
233
                final int bmpId;
234

235
                private int numTries = 0;
1✔
236

237
                Request(int bmpId) {
1✔
238
                        this.bmpId = bmpId;
1✔
239
                }
1✔
240

241
                /**
242
                 * @return Whether this request may be repeated.
243
                 */
244
                boolean isRepeat() {
245
                        return numTries < props.getPowerAttempts();
1✔
246
                }
247

248
                /**
249
                 * Basic machinery for handling exceptions that arise while performing a
250
                 * BMP action. Runs on a thread that may touch a BMP directly, but which
251
                 * may not touch the database.
252
                 * <p>
253
                 * Only subclasses should use this!
254
                 *
255
                 * @param body
256
                 *            What to attempt.
257
                 * @param onFailure
258
                 *            What to do on failure.
259
                 * @param onServiceRemove
260
                 *            If the exception looks serious, call this to trigger a
261
                 *            board being taken out of service.
262
                 * @return Whether to stop the retry loop.
263
                 * @throws InterruptedException
264
                 *             If interrupted.
265
                 */
266
                final boolean bmpAction(ThrowingAction body,
267
                                Consumer<Exception> onFailure,
268
                                Consumer<PermanentProcessException> onServiceRemove)
269
                                throws InterruptedException {
270
                        boolean isLastTry = numTries++ >= props.getPowerAttempts();
1✔
271
                        Exception exn;
272
                        try {
273
                                body.act();
1✔
274
                                // Exit the retry loop (up the stack); the requests all worked
275
                                return true;
1✔
276
                        } catch (InterruptedException e) {
×
277
                                /*
278
                                 * We were interrupted! This happens when we're shutting down.
279
                                 * Log (because we're in an inconsistent state) and rethrow so
280
                                 * that the outside gets to clean up.
281
                                 */
282
                                log.error("Requests failed on BMP {} because of "
×
283
                                                + "interruption", bmpId, e);
×
284
                                currentThread().interrupt();
×
285
                                throw e;
×
286
                        } catch (TransientProcessException e) {
×
287
                                if (!isLastTry) {
×
288
                                        // Log somewhat gently; we *might* be able to recover...
289
                                        log.warn("Retrying requests on BMP {} after {}: {}",
×
290
                                                        bmpId, props.getProbeInterval(),
×
291
                                                        e.getMessage());
×
292
                                        // Ask for a retry
293
                                        return false;
×
294
                                }
295
                                exn = e;
×
296
                                log.error("Requests failed on BMP {}", bmpId, e);
×
297
                        } catch (PermanentProcessException e) {
×
298
                                log.error("BMP {} on {} is unreachable", e.source, bmpId, e);
×
299
                                onServiceRemove.accept(e);
×
300
                                exn = e;
×
301
                        } catch (CallerProcessException e) {
×
302
                                // This is probably a software bug
303
                                log.error("SW bug talking to BMP {}", bmpId, e);
×
304
                                exn = e;
×
305
                        } catch (ProcessException | IOException | RuntimeException e) {
×
306
                                log.error("Requests failed on BMP {}", bmpId, e);
×
307
                                exn = e;
×
308
                        }
×
309
                        /*
310
                         * Common permanent failure handling case; arrange for taking a
311
                         * board out of service, mark a request as failed, and stop the
312
                         * retry loop.
313
                         */
314
                        onFailure.accept(exn);
×
315
                        return true;
×
316
                }
317

318
                /**
319
                 * Add a report to the database of a problem with a board.
320
                 *
321
                 * @param sql
322
                 *            How to talk to the DB
323
                 * @param boardId
324
                 *            Which board has the problem
325
                 * @param jobId
326
                 *            What job was associated with the problem (if any)
327
                 * @param msg
328
                 *            Information about what the problem was
329
                 */
330
                final void addBoardReport(Connection c, int boardId, Integer jobId,
331
                                String msg) {
332
                        try (var getUser = c.query(GET_USER_DETAILS_BY_NAME);
×
333
                                        var insertBoardReport = c.update(INSERT_BOARD_REPORT)) {
×
334
                                getUser.call1(row -> row.getInt("user_id"),
×
335
                                                allocProps.getSystemReportUser()).ifPresent(
×
336
                                                                userId -> insertBoardReport.call(
×
337
                                                                                boardId, jobId,        msg, userId));
×
338
                        }
339
                }
×
340

341
                /**
342
                 * Marks a board as actually dead, and requests we send email about it.
343
                 *
344
                 * @param sql
345
                 *            How to talk to the DB
346
                 * @param boardId
347
                 *            Which board has the problem
348
                 * @param msg
349
                 *            Information about what the problem was
350
                 * @return Whether we've successfully done a change.
351
                 */
352
                final void markBoardAsDead(Connection c, int boardId, String msg) {
353
                        try (var setFunctioning = c.update(SET_FUNCTIONING_FIELD);
×
354
                                        var findBoardById = c.query(FIND_BOARD_BY_ID)) {
×
355
                                boolean result = setFunctioning.call(false, boardId) > 0;
×
356
                                if (result) {
×
357
                                        findBoardById.call1(row -> {
×
358
                                                var ser = row.getString("physical_serial_id");
×
359
                                                if (ser == null) {
×
360
                                                        ser = "<UNKNOWN>";
×
361
                                                }
362
                                                var fullMessage = format(
×
363
                                                                "Marked board at %d,%d,%d of %s (serial: %s) "
364
                                                                                + "as dead: %s",
365
                                                                row.getInt("x"), row.getInt("y"),
×
366
                                                                row.getInt("z"), row.getString("machine_name"),
×
367
                                                                ser, msg);
368
                                                emailSender.sendServiceMail(fullMessage);
×
369
                                                return null;
×
370
                                        }, boardId);
×
371
                                }
372
                        }
373
                }
×
374

375
                boolean processRequest(SpiNNakerControl control) {
376
                        while (isRepeat()) {
1✔
377
                                try {
378
                                        if (tryProcessRequest(control)) {
1✔
379
                                                return true;
1✔
380
                                        }
381
                                        sleep(props.getProbeInterval().toMillis());
×
382
                                } catch (InterruptedException e) {
×
383
                                        // If this happens, just cancel the transaction;
384
                                        // when we come back, all things will be redone.
385
                                        throw new RuntimeException(e);
×
386
                                }
×
387
                        }
388
                        return false;
×
389
                }
390

391
                abstract boolean tryProcessRequest(SpiNNakerControl control)
392
                                throws InterruptedException;
393
        }
394

395
        /**
396
         * Describes a request to modify the power status of a collection of boards.
397
         * The boards must be on a single machine and must all be assigned to a
398
         * single job.
399
         * <p>
400
         * This is the message that is sent from the main thread to the per-BMP
401
         * worker threads.
402
         *
403
         * @author Donal Fellows
404
         */
405
        private final class PowerRequest extends Request {
406
                private final List<BMPBoard> powerOnBoards = new ArrayList<>();
1✔
407

408
                private final List<BMPBoard> powerOffBoards = new ArrayList<>();
1✔
409

410
                private final List<Link> linkRequests = new ArrayList<>();
1✔
411

412
                private final int jobId;
413

414
                private final JobState from;
415

416
                private final JobState to;
417

418
                private final List<Integer> changeIds = new ArrayList<>();
1✔
419

420
                private final Map<Integer, Integer> boardToId = new HashMap<>();
1✔
421

422
                /**
                 * Create a request from a list of power changes for one BMP.
                 *
                 * @param bmpId
                 *            The ID of the BMP that the request is for.
                 * @param jobId
                 *            For what job is this?
                 * @param from
                 *            What state is the job moving from?
                 * @param to
                 *            What state is the job moving to?
                 * @param powerChanges
                 *            The individual changes. Each one says whether its board
                 *            is to be powered on or off and which of its links are to
                 *            be switched off, and supplies the change ID and board ID
                 *            that are recorded for later database updates.
                 */
                PowerRequest(int bmpId, int jobId, JobState from, JobState to,
                                List<PowerChange> powerChanges) {
                        super(bmpId);
                        this.jobId = jobId;
                        this.from = from;
                        this.to = to;
                        for (var change : powerChanges) {
                                // Sort the board into the on-list or the off-list
                                var target = change.power() ? powerOnBoards : powerOffBoards;
                                target.add(new BMPBoard(change.boardNum()));

                                for (var link : change.offLinks()) {
                                        linkRequests.add(new Link(change.boardNum(), link));
                                }
                                changeIds.add(change.changeId());
                                boardToId.put(change.boardNum(), change.boardId);
                        }
                }
472

473
                /**
474
                 * Change the power state of boards in this request.
475
                 *
476
                 * @param controllers
477
                 *            How to actually communicate with the machine
478
                 * @throws ProcessException
479
                 *             If the transceiver chokes
480
                 * @throws InterruptedException
481
                 *             If interrupted
482
                 * @throws IOException
483
                 *             If network I/O fails
484
                 */
485
                void changeBoardPowerState(SpiNNakerControl controller)
486
                                throws ProcessException, InterruptedException, IOException {
487

488
                        // Send any power on commands
489
                        if (!powerOnBoards.isEmpty()) {
1✔
490
                                controller.powerOnAndCheck(powerOnBoards);
1✔
491
                        }
492

493
                        // Process perimeter link requests next
494
                        for (var linkReq : linkRequests) {
1✔
495
                                // Set the link state, as required
496
                                controller.setLinkOff(linkReq);
1✔
497
                        }
1✔
498

499
                        // Finally send any power off commands
500
                        if (!powerOffBoards.isEmpty()) {
1✔
501
                                controller.powerOff(powerOffBoards);
1✔
502
                        }
503
                }
1✔
504

505
                /**
506
                 * Handles the database changes after a set of changes to a BMP complete
507
                 * successfully. We will move the job to the state it supposed to be in.
508
                 *
509
                 * @param sql
510
                 *            How to access the DB
511
                 * @return Whether the state of boards or jobs has changed.
512
                 */
513
                private void done() {
514
                        try (var c = getConnection();
1✔
515
                                        var deallocateBoards = c.update(DEALLOCATE_BMP_BOARDS_JOB);
1✔
516
                                        var deleteChange = c.update(FINISHED_PENDING);
1✔
517
                                        var setBoardPowerOn = c.update(SET_BOARD_POWER_ON);
1✔
518
                                        var setBoardPowerOff = c.update(SET_BOARD_POWER_OFF)) {
1✔
519
                                c.transaction(() -> {
1✔
520
                                        int turnedOn = powerOnBoards.stream().map(this::getBoardId)
1✔
521
                                                        .mapToInt(setBoardPowerOn::call).sum();
1✔
522
                                        int turnedOff =
1✔
523
                                                        powerOffBoards.stream().map(this::getBoardId)
1✔
524
                                                                        .mapToInt(setBoardPowerOff::call).sum();
1✔
525

526
                                        if (to == DESTROYED || to == QUEUED) {
1✔
527
                                                /*
528
                                                 * Need to mark the boards as not allocated; can't do
529
                                                 * that until they've been switched off.
530
                                                 */
531
                                                deallocateBoards.call(jobId, bmpId);
1✔
532
                                        }
533
                                        int completed = changeIds.stream().mapToInt(
1✔
534
                                                        deleteChange::call).sum();
1✔
535

536
                                        log.debug("BMP ACTION SUCCEEDED ({}:{}->{}): on:{} off:{} "
1✔
537
                                                        + "completed: {}",
538
                                                        jobId, from, to, turnedOn, turnedOff, completed);
1✔
539
                                });
1✔
540
                        }
541

542
                        // Tell the allocator something has happened
543
                        allocator.updateJob(jobId, from, to);
1✔
544
                }
1✔
545

546
                /**
547
                 * Handles the database changes after a set of changes to a BMP complete
548
                 * with a failure. We will roll back the job state to what it was
549
                 * before.
550
                 *
551
                 * @param sql
552
                 *            How to access the DB
553
                 * @return Whether the state of boards or jobs has changed.
554
                 */
555
                private void failed() {
556
                        var resetJobAlloc = false;
×
557
                        try (var c = getConnection();
×
558
                                        var deallocateBoards = c.update(DEALLOCATE_BMP_BOARDS_JOB);
×
559
                                        var deleteChange = c.update(FINISHED_PENDING);
×
560
                                        var setBoardPowerOff = c.update(SET_BOARD_POWER_OFF)) {
×
561
                                resetJobAlloc = c.transaction(() -> {
×
562
                                        // We should mark the boards as off
563
                                        int turnedOff =
×
564
                                                        powerOffBoards.stream().map(this::getBoardId)
×
565
                                                                        .mapToInt(setBoardPowerOff::call).sum();
×
566

567
                                        // ... even those that we should be powering on ...
568
                                        turnedOff +=
×
569
                                                        powerOnBoards.stream().map(this::getBoardId)
×
570
                                                                        .mapToInt(setBoardPowerOff::call).sum();
×
571

572
                                        // Deallocate the boards on this bmp from the job;
573
                                        // other boards can be deallocated elsewhere.
574
                                        deallocateBoards.call(jobId, bmpId);
×
575

576
                                        // Delete change ids as they are done even if failed.
577
                                        var completed = changeIds.stream().mapToInt(
×
578
                                                        deleteChange::call).sum();
×
579

580
                                        log.debug(
×
581
                                                        "BMP ACTION FAILED on {} ({}:{}->{}) off:{} "
582
                                                        + "completed:{}",
583
                                                        bmpId, jobId, from, to, turnedOff, completed);
×
584

585
                                        // If we were meant to be powering up, reset the allocation
586
                                        // once done here.
587
                                        return (to == READY && powerOffBoards.isEmpty());
×
588
                                });
589
                        }
590
                        if (resetJobAlloc) {
×
591
                                allocator.resetPowerOnFailure(jobId);
×
592
                        }
593
                }
×
594

595
                /**
596
                 * Process an action to power on or off a set of boards. Runs on a
597
                 * thread that may touch a BMP directly, but which may not touch the
598
                 * database.
599
                 *
600
                 * @param controller
601
                 *            How to actually reach the BMPs.
602
                 * @return Whether this action has "succeeded" and shouldn't be retried.
603
                 * @throws InterruptedException
604
                 *             If interrupted.
605
                 */
606
                @Override
607
                boolean tryProcessRequest(SpiNNakerControl controller)
608
                                throws InterruptedException {
609
                        boolean ok = bmpAction(() -> {
1✔
610
                                changeBoardPowerState(controller);
1✔
611
                                // We want to ensure the lead board is alive
612
                                if (!serviceControl.isUseDummyBMP()) {
1✔
613
                                        // Don't bother with pings when the dummy is enabled
614
                                        controller.ping(powerOnBoards);
×
615
                                }
616
                                done();
1✔
617
                        }, e -> {
1✔
618
                                failed();
×
619
                                synchronized (BMPController.this) {
×
620
                                        bmpProcessingException = e;
×
621
                                }
×
622
                        }, ppe -> {
×
623
                                badBoard(ppe);
×
624
                        });
×
625
                        return ok;
1✔
626
                }
627

628
                @Override
629
                public String toString() {
630
                        var sb = new StringBuilder("PowerRequest(for=")
×
631
                                        .append(bmpId);
×
632
                        sb.append(";on=").append(powerOnBoards);
×
633
                        sb.append(",off=").append(powerOffBoards);
×
634
                        sb.append(",links=").append(linkRequests);
×
635
                        return sb.append(")").toString();
×
636
                }
637

638
                private static final String REPORT_MSG =
639
                                "board was not reachable when trying to power it: ";
640

641
                /**
642
                 * When a BMP is unroutable, we must tell the alloc engine to pick
643
                 * somewhere else, and we should mark the board as out of service too;
644
                 * it's never going to work so taking it out right away is the only sane
645
                 * plan. We also need to nuke the planned changes. Retrying is bad.
646
                 *
647
                 * @param failure
648
                 *            The failure message.
649
                 * @return Whether the state of boards or jobs has changed.
650
                 */
651
                private void badBoard(ProcessException failure) {
652
                        try (var c = getConnection()) {
×
653
                                c.transaction(() -> {
×
654
                                        getBoardId(failure.source).ifPresent(boardId -> {
×
655
                                                // Mark the board as dead right now
656
                                                markBoardAsDead(c, boardId, REPORT_MSG + failure);
×
657
                                                // Add a report if we can
658
                                                addBoardReport(c, boardId, jobId, REPORT_MSG + failure);
×
659
                                        });
×
660
                                });
×
661
                        }
662
                }
×
663

664
                /**
665
                 * Given a board address, get the ID that it corresponds to. Reverses
666
                 * {@link #idToBoard}.
667
                 *
668
                 * @param addr
669
                 *            The board address.
670
                 * @return The ID, if one can be found.
671
                 */
672
                private Optional<Integer> getBoardId(HasBMPLocation addr) {
673
                        return Optional.ofNullable(boardToId.get(addr.getBoard()));
×
674
                }
675

676
                private Integer getBoardId(BMPBoard board) {
677
                        return boardToId.get(board.board());
1✔
678
                }
679
        }
680

681
        /**
682
         * A request to read or write information on a BMP. Includes blacklists,
683
         * serial numbers, temperature data, etc.
684
         *
685
         * @author Donal Fellows
686
         */
687
        private final class BoardRequest extends Request {
688
                private final NonBootOperation op;
689

690
                private final int opId;
691

692
                private final int boardId;
693

694
                private final BMPCoords bmp;
695

696
                private final BMPBoard board;
697

698
                private final String bmpSerialId;
699

700
                private final Blacklist blacklist;
701

702
                private final int machineId;
703

704
                private BoardRequest(int bmpId, NonBootOperation op, Row row) {
1✔
705
                        super(bmpId);
1✔
706
                        this.op = op;
1✔
707
                        opId = row.getInt("op_id");
1✔
708
                        boardId = row.getInt("board_id");
1✔
709
                        bmp = new BMPCoords(row.getInt("cabinet"), row.getInt("frame"));
1✔
710
                        board = new BMPBoard(row.getInt("board_num"));
1✔
711
                        if (op == WRITE_BL) {
1✔
712
                                blacklist = row.getSerial("data", Blacklist.class);
1✔
713
                        } else {
714
                                blacklist = null;
1✔
715
                        }
716
                        bmpSerialId = row.getString("bmp_serial_id");
1✔
717
                        machineId = row.getInt("machine_id");
1✔
718
                }
1✔
719

720
                /** The serial number actually read from the board. */
721
                private String readSerial;
722

723
                /**
724
                 * Access the DB to store the serial number information that we
725
                 * retrieved. A transaction should already be held.
726
                 *
727
                 * @param c
728
                 *            How to access the DB
729
                 * @return Whether we've changed anything
730
                 */
731
                private void recordSerialIds(Connection c) {
732
                        try (var setBoardSerialIds = c.update(SET_BOARD_SERIAL_IDS)) {
1✔
733
                                setBoardSerialIds.call(boardId, readSerial,
1✔
734
                                                phySerMap.getPhysicalId(readSerial));
1✔
735
                        }
736
                }
1✔
737

738
                /**
739
                 * Access the DB to mark the read request as successful and store the
740
                 * blacklist that was read. A transaction should already be held.
741
                 *
742
                 * @param c
743
                 *            How to access the DB
744
                 * @param readBlacklist
745
                 *            The blacklist that was read
746
                 * @return Whether we've changed anything
747
                 */
748
                private void doneReadBlacklist(Connection c, Blacklist readBlacklist) {
749
                        try (var completed = c.update(COMPLETED_BOARD_INFO_READ)) {
1✔
750
                                log.debug("Completing blacklist read opId {}", opId);
1✔
751
                                completed.call(readBlacklist, opId);
1✔
752
                        }
753
                }
1✔
754

755
                /**
756
                 * Access the DB to mark the write request as successful. A transaction
757
                 * should already be held.
758
                 *
759
                 * @param c
760
                 *            How to access the DB
761
                 * @return Whether we've changed anything
762
                 */
763
                private void doneWriteBlacklist(Connection c) {
764
                        try (var completed = c.update(COMPLETED_BLACKLIST_WRITE)) {
1✔
765
                                completed.call(opId);
1✔
766
                        }
767
                }
1✔
768

769
                /**
770
                 * Access the DB to mark the read request as successful; the actual
771
                 * store of the serial data is elsewhere
772
                 * ({@link #recordSerialIds(Connection)}). A transaction should already
773
                 * be held.
774
                 *
775
                 * @param c
776
                 *            How to access the DB
777
                 * @return Whether we've changed anything
778
                 */
779
                private void doneReadSerial(Connection c) {
780
                        try (var completed = c.update(COMPLETED_GET_SERIAL_REQ)) {
1✔
781
                                completed.call(opId);
1✔
782
                        }
783
                }
1✔
784

785
                /**
786
                 * Access the DB to mark the read request as successful and store the
787
                 * ADC info that was read. A transaction should be held.
788
                 *
789
                 * @param c
790
                 *            The database connection.
791
                 */
792
                private void doneReadTemps(Connection c, ADCInfo adcInfo) {
793
                        try (var completed = c.update(COMPLETED_BOARD_INFO_READ)) {
×
794
                                log.debug("Completing temperature read opId {}", opId);
×
795
                                completed.call(adcInfo, opId);
×
796
                        }
797
                }
×
798

799
                /**
800
                 * Access the DB to mark the request as failed and store the exception.
801
                 *
802
                 * @param exn
803
                 *            The exception that caused the failure.
804
                 * @return Whether we've changed anything
805
                 */
806
                private void failed(Exception exn) {
807
                        try (var c = getConnection();
×
808
                                        var failed = c.update(FAILED_BLACKLIST_OP)) {
×
809
                                c.transaction(() -> failed.call(exn, opId));
×
810
                        }
811
                }
×
812

813
                private static final String REPORT_MSG =
814
                                "board was not reachable when trying to access its blacklist: ";
815

816
                /**
817
                 * Access the DB to mark a board as out of service.
818
                 *
819
                 * @param exn
820
                 *            The exception that caused the failure.
821
                 * @return Whether we've changed anything
822
                 */
823
                void takeOutOfService(Exception exn) {
824
                        try (var c = getConnection()) {
×
825
                                c.transaction(() -> {
×
826
                                        addBoardReport(c, boardId, null, REPORT_MSG + exn);
×
827
                                        markBoardAsDead(c, boardId, REPORT_MSG + exn);
×
828
                                });
×
829
                        }
830
                }
×
831

832
                /**
833
                 * Process an action to work with a blacklist or serial number. Runs on
834
                 * a thread that may touch a BMP directly, but which may not touch the
835
                 * database.
836
                 *
837
                 * @param controller
838
                 *            How to actually reach the BMP.
839
                 * @return Whether this action has "succeeded" and shouldn't be retried.
840
                 * @throws InterruptedException
841
                 *             If interrupted.
842
                 */
843
                @Override
844
                boolean tryProcessRequest(SpiNNakerControl controller)
845
                                throws InterruptedException {
846
                        return bmpAction(() -> {
1✔
847
                                switch (op) {
1✔
848
                                case WRITE_BL -> writeBlacklist(controller);
1✔
849
                                case READ_BL -> readBlacklist(controller);
1✔
850
                                case GET_SERIAL -> readSerial(controller);
1✔
851
                                case READ_TEMP ->  readTemps(controller);
×
852
                                default -> throw new IllegalArgumentException();
×
853
                                }
854
                                epochs.blacklistChanged(boardId);
1✔
855
                                epochs.machineChanged(machineId);
1✔
856
                        }, e -> {
1✔
857
                                failed(e);
×
858
                        }, ppe -> {
×
859
                                takeOutOfService(ppe);
×
860
                        });
×
861
                }
862

863
                /**
864
                 * Process an action to read a blacklist.
865
                 *
866
                 * @param controller
867
                 *            How to actually reach the BMP.
868
                 * @throws InterruptedException
869
                 *             If interrupted.
870
                 * @throws IOException
871
                 *             If the network is unhappy.
872
                 * @throws ProcessException
873
                 *             If the BMP rejects a message.
874
                 */
875
                private void readBlacklist(SpiNNakerControl controller)
876
                                throws InterruptedException, ProcessException, IOException {
877
                        readSerial = controller.readSerial(board);
1✔
878
                        if (bmpSerialId != null && !bmpSerialId.equals(readSerial)) {
1✔
879
                                /*
880
                                 * Doesn't match; WARN but keep going; hardware may just be
881
                                 * remapped behind our back.
882
                                 */
883
                                log.warn(
×
884
                                                "blacklist read mismatch: expected serial ID '{}' "
885
                                                                + "not equal to actual serial ID '{}'",
886
                                                bmpSerialId, readSerial);
887
                        }
888
                        var readBlacklist = controller.readBlacklist(board);
1✔
889
                        try (var c = getConnection()) {
1✔
890
                                c.transaction(() -> {
1✔
891
                                        recordSerialIds(c);
1✔
892
                                        doneReadBlacklist(c, readBlacklist);
1✔
893
                                });
1✔
894
                        }
895
                }
1✔
896

897
                /**
898
                 * Process an action to write a blacklist.
899
                 *
900
                 * @param controller
901
                 *            How to actually reach the BMP.
902
                 * @throws InterruptedException
903
                 *             If interrupted.
904
                 * @throws IOException
905
                 *             If the network is unhappy.
906
                 * @throws ProcessException
907
                 *             If the BMP rejects a message.
908
                 * @throws IllegalStateException
909
                 *             If the operation is applied to a board other than the one
910
                 *             that it is expected to apply to.
911
                 */
912
                private void writeBlacklist(SpiNNakerControl controller)
913
                                throws InterruptedException, ProcessException, IOException {
914
                        readSerial = controller.readSerial(board);
1✔
915
                        if (bmpSerialId != null && !bmpSerialId.equals(readSerial)) {
1✔
916
                                // Doesn't match, so REALLY unsafe to keep going!
917
                                throw new IllegalStateException(format(
×
918
                                                "aborting blacklist write: expected serial ID '%s' "
919
                                                                + "not equal to actual serial ID '%s'",
920
                                                bmpSerialId, readSerial));
921
                        }
922
                        controller.writeBlacklist(board, requireNonNull(blacklist));
1✔
923
                        try (var c = getConnection()) {
1✔
924
                                c.transaction(() -> doneWriteBlacklist(c));
1✔
925
                        }
926
                }
1✔
927

928
                /**
929
                 * Process an action to read the serial number from a BMP.
930
                 *
931
                 * @param controller
932
                 *            How to actually reach the BMP.
933
                 * @throws InterruptedException
934
                 *             If interrupted.
935
                 * @throws IOException
936
                 *             If the network is unhappy
937
                 * @throws ProcessException
938
                 *             If the BMP rejects a message.
939
                 */
940
                private void readSerial(SpiNNakerControl controller)
941
                                throws InterruptedException, ProcessException, IOException {
942
                        readSerial = controller.readSerial(board);
1✔
943
                        try (var c = getConnection()) {
1✔
944
                                c.transaction(() -> {
1✔
945
                                        recordSerialIds(c);
1✔
946
                                        doneReadSerial(c);
1✔
947
                                });
1✔
948
                        }
949
                }
1✔
950

951
                /**
952
                 * Process an action to read some temperature data.
953
                 *
954
                 * @param controller
955
                 *            How to actually reach the BMP.
956
                 * @throws InterruptedException
957
                 *             If interrupted.
958
                 * @throws IOException
959
                 *             If the network is unhappy.
960
                 * @throws ProcessException
961
                 *             If the BMP rejects a message.
962
                 */
963
                private void readTemps(SpiNNakerControl controller)
964
                                throws InterruptedException, ProcessException, IOException {
965
                        var adcInfo = controller.readTemp(board);
×
966
                        try (var c = getConnection()) {
×
967
                                c.transaction(() -> doneReadTemps(c, adcInfo));
×
968
                        }
969
                }
×
970

971
                @Override
972
                public String toString() {
973
                        var sb = new StringBuilder("BoardRequest(for ");
×
974
                        sb.append("bmp=").append(bmp);
×
975
                        sb.append(",board=").append(boardId);
×
976
                        sb.append(",op=").append(op);
×
977
                        return sb.append(")").toString();
×
978
                }
979
        }
980

981
        private record PowerChange(Integer changeId, int jobId, Integer boardId,
1✔
982
                        Integer boardNum, boolean power, JobState from, JobState to,
983
                        List<Direction> offLinks) {
984
                PowerChange(Row row) {
985
                        this(row.getInteger("change_id"), //
1✔
986
                                        row.getInt("job_id"), //
1✔
987
                                        row.getInteger("board_id"), //
1✔
988
                                        row.getInteger("board_num"), //
1✔
989
                                        row.getBoolean("power"),
1✔
990
                                        row.getEnum("from_state", JobState.class),
1✔
991
                                        row.getEnum("to_state", JobState.class),
1✔
992
                                        List.of(Direction.values()).stream()
1✔
993
                                                        .filter(link -> !row.getBoolean(link.columnName))
1✔
994
                                                        .collect(Collectors.toList()));
1✔
995
                }
1✔
996

997
                boolean isSameJob(PowerChange p) {
998
                        return p.jobId == jobId && p.from == from && p.to == to;
×
999
                }
1000
        }
1001

1002
        // ----------------------------------------------------------------
1003
        // WORKER IMPLEMENTATION
1004

1005
        /** A worker of a given BMP. */
1006
        private final class Worker implements Runnable {
1007
                /** What are we controlling? */
1008
                private final SpiNNakerControl control;
1009

1010
                /** Which boards are we looking at? */
1011
                private final int bmpId;
1012

1013
                Worker(SpiNNakerControl control, int bmpId) {
1✔
1014
                        this.control = control;
1✔
1015
                        this.bmpId = bmpId;
1✔
1016

1017
                        log.debug("Created worker for boards {}", bmpId);
1✔
1018
                }
1✔
1019

1020
                /**
1021
                 * Periodically call to update, or trigger externally.
1022
                 */
1023
                @Override
1024
                public synchronized void run() {
1025
                        log.trace("Searching for changes on BMP {}", bmpId);
1✔
1026

1027
                        try {
1028
                                var changes = getRequestedOperations();
1✔
1029
                                for (var change : changes) {
1✔
1030
                                        change.processRequest(control);
1✔
1031
                                }
1✔
1032
                        } catch (Exception e) {
×
1033
                                log.error("unhandled exception for BMP '{}'", bmpId, e);
×
1034
                        }
1✔
1035
                }
1✔
1036

1037
                /**
1038
                 * Get the things that we want the worker to do. <em>Be very
1039
                 * careful!</em> Because this necessarily involves the database, this
1040
                 * must not touch the BMP handle as those operations take a long time
1041
                 * and we absolutely must not have a transaction open at the same time.
1042
                 *
1043
                 * @return List of operations to perform.
1044
                 */
1045
                private List<Request> getRequestedOperations() {
1046
                        var requests = new ArrayList<Request>();
1✔
1047
                        try (var c = getConnection();
1✔
1048
                                        var getPowerRequests = c.query(GET_CHANGES);
1✔
1049
                                        var getBlacklistReads = c.query(GET_BLACKLIST_READS);
1✔
1050
                                        var getBlacklistWrites = c.query(GET_BLACKLIST_WRITES);
1✔
1051
                                        var getReadSerialInfos = c.query(GET_SERIAL_INFO_REQS);
1✔
1052
                                        var getReadTemps = c.query(GET_TEMP_INFO_REQS)) {
1✔
1053
                                c.transaction(false, () -> {
1✔
1054
                                        // Batch power requests by job
1055
                                        var powerChanges = new LinkedList<>(
1✔
1056
                                                        getPowerRequests.call(PowerChange::new, bmpId));
1✔
1057
                                        while (!powerChanges.isEmpty()) {
1✔
1058
                                                var change = powerChanges.poll();
1✔
1059
                                                var jobChanges = new ArrayList<>(List.of(change));
1✔
1060
                                                while (!powerChanges.isEmpty()
1✔
1061
                                                                && change.isSameJob(powerChanges.peek())) {
×
1062
                                                        jobChanges.add(powerChanges.poll());
×
1063
                                                }
1064
                                                if (!jobChanges.isEmpty()) {
1✔
1065
                                                        log.debug("Running job changes {}", jobChanges);
1✔
1066
                                                        requests.add(new PowerRequest(bmpId, change.jobId(),
1✔
1067
                                                                        change.from(), change.to(), jobChanges));
1✔
1068
                                                }
1069
                                        }
1✔
1070

1071
                                        // Leave these until quiet
1072
                                        if (requests.isEmpty()) {
1✔
1073
                                                requests.addAll(getBlacklistReads.call(
1✔
1074
                                                                row -> new BoardRequest(bmpId, READ_BL, row),
1✔
1075
                                                                bmpId));
1✔
1076
                                        }
1077
                                        if (requests.isEmpty()) {
1✔
1078
                                                requests.addAll(getBlacklistWrites.call(
1✔
1079
                                                                row -> new BoardRequest(bmpId, WRITE_BL, row),
1✔
1080
                                                                bmpId));
1✔
1081
                                                requests.addAll(getReadSerialInfos.call(
1✔
1082
                                                                row -> new BoardRequest(bmpId, GET_SERIAL, row),
1✔
1083
                                                                bmpId));
1✔
1084
                                                requests.addAll(getReadTemps.call(
1✔
1085
                                                                row -> new BoardRequest(bmpId, READ_TEMP, row),
×
1086
                                                                bmpId));
1✔
1087
                                        }
1088
                                });
1✔
1089
                        } catch (Exception e) {
×
1090
                                log.error("unhandled exception for BMP '{}'", bmpId, e);
×
1091
                        }
1✔
1092
                        return requests;
1✔
1093
                }
1094
        }
1095

1096
        /**
1097
         * The testing interface.
1098
         *
1099
         * @hidden
1100
         */
1101
        @ForTestingOnly
1102
        public interface TestAPI {
1103
                /**
1104
                 * Ensure things are set up after a database change that updates the
1105
                 * BMPs in the system.
1106
                 */
1107
                void prepare();
1108

1109
                /**
1110
                 * The core of the scheduler.
1111
                 *
1112
                 * @param millis
1113
                 *            How many milliseconds to sleep before doing a rerun of the
1114
                 *            scheduler. If zero (or less), only one run will be done.
1115
                 * @param bmps
1116
                 *            The BMPs to be updated.
1117
                 * @throws IOException
1118
                 *             If talking to the network fails
1119
                 * @throws SpinnmanException
1120
                 *             If a BMP sends an error back
1121
                 * @throws InterruptedException
1122
                 *             If the wait for workers to spawn fails.
1123
                 */
1124
                void processRequests(long millis, Collection<Integer> bmps)
1125
                                throws IOException, SpinnmanException, InterruptedException;
1126

1127
                /**
1128
                 * The core of the scheduler. Will process for all known BMPs.
1129
                 *
1130
                 * @param millis
1131
                 *            How many milliseconds to sleep before doing a rerun of the
1132
                 *            scheduler. If zero (or less), only one run will be done.
1133
                 * @throws IOException
1134
                 *             If talking to the network fails
1135
                 * @throws SpinnmanException
1136
                 *             If a BMP sends an error back
1137
                 * @throws InterruptedException
1138
                 *             If the wait for workers to spawn fails.
1139
                 */
1140
                void processRequests(long millis)
1141
                                throws IOException, SpinnmanException, InterruptedException;
1142

1143
                /**
1144
                 * Get the most recently thrown BMP processing exception.
1145
                 *
1146
                 * @return Current processing exception.
1147
                 */
1148
                Throwable getBmpException();
1149

1150
                /** Clear the current processing exception. */
1151
                void clearBmpException();
1152
        }
1153

1154
        /**
1155
         * @return The test interface.
1156
         * @deprecated This interface is just for testing.
1157
         * @hidden
1158
         */
1159
        @ForTestingOnly
1160
        @RestrictedApi(explanation = "just for testing", link = "index.html",
1161
                        allowedOnPath = ".*/src/test/java/.*")
1162
        @Deprecated
1163
        public final TestAPI getTestAPI() {
1164
                ForTestingOnly.Utils.checkForTestClassOnStack();
1✔
1165
                return new TestAPI() {
1✔
1166
                        @Override
1167
                        public void prepare() {
1168
                                makeWorkers();
1✔
1169
                        }
1✔
1170

1171
                        @Override
1172
                        public void processRequests(long millis, Collection<Integer> bmps)
1173
                                        throws IOException, SpinnmanException,
1174
                                        InterruptedException {
1175
                                /*
1176
                                 * Runs twice because it takes two cycles to fully process a
1177
                                 * request.
1178
                                 */
1179
                                triggerSearch(bmps);
1✔
1180
                                if (millis > 0) {
1✔
1181
                                        Thread.sleep(millis);
1✔
1182
                                        triggerSearch(bmps);
1✔
1183
                                }
1184
                        }
1✔
1185

1186
                        @Override
1187
                        public void processRequests(long millis) throws IOException,
1188
                                        SpinnmanException, InterruptedException {
1189
                                processRequests(millis, workers.keySet());
1✔
1190
                        }
1✔
1191

1192
                        @Override
1193
                        public Throwable getBmpException() {
1194
                                synchronized (BMPController.this) {
1✔
1195
                                        return bmpProcessingException;
1✔
1196
                                }
1197
                        }
1198

1199
                        @Override
1200
                        public void clearBmpException() {
1201
                                synchronized (BMPController.this) {
1✔
1202
                                        bmpProcessingException = null;
1✔
1203
                                }
1✔
1204
                        }
1✔
1205
                };
1206
        }
1207
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc