• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

SpiNNakerManchester / JavaSpiNNaker / 6233274834

19 Sep 2023 08:46AM UTC coverage: 36.409% (-0.6%) from 36.982%
6233274834

Pull #658

github

dkfellows
Merge branch 'master' into java-17
Pull Request #658: Update Java version to 17

1656 of 1656 new or added lines in 260 files covered. (100.0%)

8373 of 22997 relevant lines covered (36.41%)

0.36 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

62.47
/SpiNNaker-allocserv/src/main/java/uk/ac/manchester/spinnaker/alloc/bmp/BMPController.java
1
/*
2
 * Copyright (c) 2021 The University of Manchester
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *     https://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
package uk.ac.manchester.spinnaker.alloc.bmp;
17

18
import static java.lang.String.format;
19
import static java.lang.Thread.currentThread;
20
import static java.lang.Thread.sleep;
21
import static java.time.Instant.now;
22
import static java.util.Objects.requireNonNull;
23
import static org.slf4j.LoggerFactory.getLogger;
24
import static uk.ac.manchester.spinnaker.alloc.bmp.NonBootOperation.GET_SERIAL;
25
import static uk.ac.manchester.spinnaker.alloc.bmp.NonBootOperation.READ_BL;
26
import static uk.ac.manchester.spinnaker.alloc.bmp.NonBootOperation.READ_TEMP;
27
import static uk.ac.manchester.spinnaker.alloc.bmp.NonBootOperation.WRITE_BL;
28
import static uk.ac.manchester.spinnaker.alloc.model.JobState.DESTROYED;
29
import static uk.ac.manchester.spinnaker.alloc.model.JobState.QUEUED;
30
import static uk.ac.manchester.spinnaker.alloc.model.JobState.READY;
31

32
import java.io.IOException;
33
import java.lang.Thread.UncaughtExceptionHandler;
34
import java.util.ArrayList;
35
import java.util.Collection;
36
import java.util.HashMap;
37
import java.util.LinkedList;
38
import java.util.List;
39
import java.util.Map;
40
import java.util.Optional;
41
import java.util.function.Consumer;
42
import java.util.stream.Collectors;
43

44
import javax.annotation.PostConstruct;
45

46
import org.slf4j.Logger;
47
import org.springframework.beans.factory.ObjectProvider;
48
import org.springframework.beans.factory.annotation.Autowired;
49
import org.springframework.jmx.export.annotation.ManagedResource;
50
import org.springframework.scheduling.TaskScheduler;
51
import org.springframework.scheduling.concurrent.ThreadPoolTaskScheduler;
52
import org.springframework.stereotype.Service;
53

54
import com.google.errorprone.annotations.RestrictedApi;
55
import com.google.errorprone.annotations.concurrent.GuardedBy;
56

57
import uk.ac.manchester.spinnaker.alloc.ForTestingOnly;
58
import uk.ac.manchester.spinnaker.alloc.ServiceMasterControl;
59
import uk.ac.manchester.spinnaker.alloc.SpallocProperties.AllocatorProperties;
60
import uk.ac.manchester.spinnaker.alloc.SpallocProperties.TxrxProperties;
61
import uk.ac.manchester.spinnaker.alloc.admin.ReportMailSender;
62
import uk.ac.manchester.spinnaker.alloc.allocator.AllocatorTask;
63
import uk.ac.manchester.spinnaker.alloc.allocator.Epochs;
64
import uk.ac.manchester.spinnaker.alloc.allocator.SpallocAPI;
65
import uk.ac.manchester.spinnaker.alloc.db.DatabaseAPI.Connection;
66
import uk.ac.manchester.spinnaker.alloc.db.DatabaseAwareBean;
67
import uk.ac.manchester.spinnaker.alloc.db.Row;
68
import uk.ac.manchester.spinnaker.alloc.model.Direction;
69
import uk.ac.manchester.spinnaker.alloc.model.JobState;
70
import uk.ac.manchester.spinnaker.machine.board.BMPBoard;
71
import uk.ac.manchester.spinnaker.machine.board.BMPCoords;
72
import uk.ac.manchester.spinnaker.machine.board.HasBMPLocation;
73
import uk.ac.manchester.spinnaker.messages.model.ADCInfo;
74
import uk.ac.manchester.spinnaker.messages.model.Blacklist;
75
import uk.ac.manchester.spinnaker.transceiver.ProcessException;
76
import uk.ac.manchester.spinnaker.transceiver.ProcessException.CallerProcessException;
77
import uk.ac.manchester.spinnaker.transceiver.ProcessException.PermanentProcessException;
78
import uk.ac.manchester.spinnaker.transceiver.ProcessException.TransientProcessException;
79
import uk.ac.manchester.spinnaker.transceiver.SpinnmanException;
80
import uk.ac.manchester.spinnaker.utils.UsedInJavadocOnly;
81

82
/**
83
 * Manages the BMPs of machines controlled by Spalloc.
84
 *
85
 * @author Donal Fellows
86
 */
87
@Service("bmpController")
88
@ManagedResource("Spalloc:type=BMPController,name=bmpController")
89
public class BMPController extends DatabaseAwareBean {
1✔
90
        private static final Logger log = getLogger(BMPController.class);
1✔
91

92
        @Autowired
93
        private SpallocAPI spallocCore;
94

95
        @Autowired
96
        private ServiceMasterControl serviceControl;
97

98
        @Autowired
99
        private Epochs epochs;
100

101
        @Autowired
102
        private TxrxProperties props;
103

104
        @Autowired
105
        private PhysicalSerialMapping phySerMap;
106

107
        @Autowired
108
        private AllocatorProperties allocProps;
109

110
        @Autowired
111
        private ReportMailSender emailSender;
112

113
        @Autowired
114
        private AllocatorTask allocator;
115

116
        private TaskScheduler scheduler;
117

118
        /**
119
         * Map from BMP ID to worker task that handles it.
120
         */
121
        private final Map<Integer, Worker> workers = new HashMap<>();
1✔
122

123
        /**
124
         * Factory for {@linkplain SpiNNakerControl controllers}. Only use via
125
         * {@link #controllerFactory}.
126
         */
127
        @Autowired
128
        private ObjectProvider<SpiNNakerControl> controllerFactoryBean;
129

130
        /**
131
         * Type-safe factory for {@linkplain SpiNNakerControl controllers}.
132
         */
133
        private SpiNNakerControl.Factory controllerFactory;
134

135
        @GuardedBy("this")
136
        private Throwable bmpProcessingException;
137

138
        /**
139
         * An {@link UncaughtExceptionHandler}.
140
         *
141
         * @param thread
142
         *            The thread with the problem.
143
         * @param exception
144
         *            The exception that describes the problem.
145
         */
146
        @UsedInJavadocOnly(UncaughtExceptionHandler.class)
147
        private void handleException(Thread thread, Throwable exception) {
148
                log.error("uncaught exception in BMP worker {}", thread, exception);
×
149
        }
×
150

151
        // ----------------------------------------------------------------
152

153
        @PostConstruct
154
        private void init() {
155
                // Set up scheduler
156
                var sched = new ThreadPoolTaskScheduler();
1✔
157
                scheduler = sched;
1✔
158
                sched.setThreadGroupName("BMP");
1✔
159

160
                controllerFactory = controllerFactoryBean::getObject;
1✔
161
                allocator.setBMPController(this);
1✔
162

163
                // We do the making of workers later in tests
164
                List<Worker> madeWorkers = null;
1✔
165
                if (!serviceControl.isUseDummyBMP()) {
1✔
166
                        madeWorkers = makeWorkers();
×
167
                }
168

169
                // Set the pool size to match the number of workers
170
                if (workers.size() > 1) {
1✔
171
                        sched.setPoolSize(workers.size());
×
172
                }
173

174
                // Launch the scheduler now it is all set up
175
                sched.initialize();
1✔
176

177
                // And now use the scheduler
178
                if (madeWorkers != null) {
1✔
179
                        for (var worker : madeWorkers) {
×
180
                                scheduler.scheduleAtFixedRate(worker, allocProps.getPeriod());
×
181
                        }
×
182
                }
183
        }
1✔
184

185
        private List<Worker> makeWorkers() {
186
                // Make workers
187
                try (var c = getConnection();
1✔
188
                                var getBmps = c.query(GET_ALL_BMPS);
1✔
189
                                var getBoards = c.query(GET_ALL_BMP_BOARDS)) {
1✔
190
                        return c.transaction(false, () -> getBmps.call(row -> {
1✔
191
                                var m = spallocCore.getMachine(row.getString("machine_name"),
1✔
192
                                                true);
193
                                var coords = new BMPCoords(row.getInt("cabinet"),
1✔
194
                                                row.getInt("frame"));
1✔
195
                                var boards = new HashMap<BMPBoard, String>();
1✔
196
                                var bmpId = row.getInt("bmp_id");
1✔
197
                                getBoards.call(r -> {
1✔
198
                                        boards.put(new BMPBoard(r.getInt("board_num")),
1✔
199
                                                        r.getString("address"));
1✔
200
                                        return null;
1✔
201
                                }, bmpId);
1✔
202
                                var control = controllerFactory.create(m.get(), coords, boards);
1✔
203
                                var worker = new Worker(control, bmpId);
1✔
204
                                workers.put(row.getInt("bmp_id"), worker);
1✔
205
                                return worker;
1✔
206
                        }));
207
                }
208
        }
209

210
        /**
211
         * Trigger the execution of the workers for the given BMPs now.
212
         *
213
         * @param bmps
214
         *            A list of BMPs that have changed.
215
         */
216
        public void triggerSearch(Collection<Integer> bmps) {
217
                for (var b : bmps) {
1✔
218
                        var worker = workers.get(b);
1✔
219
                        if (worker != null) {
1✔
220
                                scheduler.schedule(worker::run, now());
1✔
221
                        } else {
222
                                log.error("Could not find worker for BMP {}", b);
1✔
223
                        }
224
                }
1✔
225
        }
1✔
226

227
        /** An action that may throw any of a range of exceptions. */
228
        private interface ThrowingAction {
229
                void act() throws ProcessException, IOException, InterruptedException;
230
        }
231

232
        private abstract sealed class Request
233
                        permits BoardRequest, PowerRequest {
234
                final int bmpId;
235

236
                private int numTries = 0;
1✔
237

238
                Request(int bmpId) {
1✔
239
                        this.bmpId = bmpId;
1✔
240
                }
1✔
241

242
                /**
243
                 * @return Whether this request may be repeated.
244
                 */
245
                boolean isRepeat() {
246
                        return numTries < props.getPowerAttempts();
1✔
247
                }
248

249
                /**
250
                 * Basic machinery for handling exceptions that arise while performing a
251
                 * BMP action. Runs on a thread that may touch a BMP directly, but which
252
                 * may not touch the database.
253
                 * <p>
254
                 * Only subclasses should use this!
255
                 *
256
                 * @param body
257
                 *            What to attempt.
258
                 * @param onFailure
259
                 *            What to do on failure.
260
                 * @param onServiceRemove
261
                 *            If the exception looks serious, call this to trigger a
262
                 *            board being taken out of service.
263
                 * @return Whether to stop the retry loop.
264
                 * @throws InterruptedException
265
                 *             If interrupted.
266
                 */
267
                final boolean bmpAction(ThrowingAction body,
268
                                Consumer<Exception> onFailure,
269
                                Consumer<PermanentProcessException> onServiceRemove)
270
                                throws InterruptedException {
271
                        boolean isLastTry = numTries++ >= props.getPowerAttempts();
1✔
272
                        Exception exn;
273
                        try {
274
                                body.act();
1✔
275
                                // Exit the retry loop (up the stack); the requests all worked
276
                                return true;
1✔
277
                        } catch (InterruptedException e) {
×
278
                                /*
279
                                 * We were interrupted! This happens when we're shutting down.
280
                                 * Log (because we're in an inconsistent state) and rethrow so
281
                                 * that the outside gets to clean up.
282
                                 */
283
                                log.error("Requests failed on BMP {} because of "
×
284
                                                + "interruption", bmpId, e);
×
285
                                currentThread().interrupt();
×
286
                                throw e;
×
287
                        } catch (TransientProcessException e) {
×
288
                                if (!isLastTry) {
×
289
                                        // Log somewhat gently; we *might* be able to recover...
290
                                        log.warn("Retrying requests on BMP {} after {}: {}",
×
291
                                                        bmpId, props.getProbeInterval(),
×
292
                                                        e.getMessage());
×
293
                                        // Ask for a retry
294
                                        return false;
×
295
                                }
296
                                exn = e;
×
297
                                log.error("Requests failed on BMP {}", bmpId, e);
×
298
                        } catch (PermanentProcessException e) {
×
299
                                log.error("BMP {} on {} is unreachable", e.source, bmpId, e);
×
300
                                onServiceRemove.accept(e);
×
301
                                exn = e;
×
302
                        } catch (CallerProcessException e) {
×
303
                                // This is probably a software bug
304
                                log.error("SW bug talking to BMP {}", bmpId, e);
×
305
                                exn = e;
×
306
                        } catch (ProcessException | IOException | RuntimeException e) {
×
307
                                log.error("Requests failed on BMP {}", bmpId, e);
×
308
                                exn = e;
×
309
                        }
×
310
                        /*
311
                         * Common permanent failure handling case; arrange for taking a
312
                         * board out of service, mark a request as failed, and stop the
313
                         * retry loop.
314
                         */
315
                        onFailure.accept(exn);
×
316
                        return true;
×
317
                }
318

319
                /**
320
                 * Add a report to the database of a problem with a board.
321
                 *
322
                 * @param c
323
                 *            How to talk to the DB
324
                 * @param boardId
325
                 *            Which board has the problem
326
                 * @param jobId
327
                 *            What job was associated with the problem (if any)
328
                 * @param msg
329
                 *            Information about what the problem was
330
                 */
331
                final void addBoardReport(Connection c, int boardId, Integer jobId,
332
                                String msg) {
333
                        try (var getUser = c.query(GET_USER_DETAILS_BY_NAME);
×
334
                                        var insertBoardReport = c.update(INSERT_BOARD_REPORT)) {
×
335
                                getUser.call1(row -> row.getInt("user_id"),
×
336
                                                allocProps.getSystemReportUser()).ifPresent(
×
337
                                                                userId -> insertBoardReport.call(
×
338
                                                                                boardId, jobId,        msg, userId));
×
339
                        }
340
                }
×
341

342
                /**
343
                 * Marks a board as actually dead, and requests we send email about it.
344
                 *
345
                 * @param c
346
                 *            How to talk to the DB
347
                 * @param boardId
348
                 *            Which board has the problem
349
                 * @param msg
350
                 *            Information about what the problem was
351
                 * @return Whether we've successfully done a change.
352
                 */
353
                final void markBoardAsDead(Connection c, int boardId, String msg) {
354
                        try (var setFunctioning = c.update(SET_FUNCTIONING_FIELD);
×
355
                                        var findBoardById = c.query(FIND_BOARD_BY_ID)) {
×
356
                                boolean result = setFunctioning.call(false, boardId) > 0;
×
357
                                if (result) {
×
358
                                        findBoardById.call1(row -> {
×
359
                                                var ser = row.getString("physical_serial_id");
×
360
                                                if (ser == null) {
×
361
                                                        ser = "<UNKNOWN>";
×
362
                                                }
363
                                                var fullMessage = format(
×
364
                                                                "Marked board at %d,%d,%d of %s (serial: %s) "
365
                                                                                + "as dead: %s",
366
                                                                row.getInt("x"), row.getInt("y"),
×
367
                                                                row.getInt("z"), row.getString("machine_name"),
×
368
                                                                ser, msg);
369
                                                emailSender.sendServiceMail(fullMessage);
×
370
                                                return null;
×
371
                                        }, boardId);
×
372
                                }
373
                        }
374
                }
×
375

376
                boolean processRequest(SpiNNakerControl control) {
377
                        while (isRepeat()) {
1✔
378
                                try {
379
                                        if (tryProcessRequest(control)) {
1✔
380
                                                return true;
1✔
381
                                        }
382
                                        sleep(props.getProbeInterval().toMillis());
×
383
                                } catch (InterruptedException e) {
×
384
                                        // If this happens, just cancel the transaction;
385
                                        // when we come back, all things will be redone.
386
                                        throw new RuntimeException(e);
×
387
                                }
×
388
                        }
389
                        return false;
×
390
                }
391

392
                abstract boolean tryProcessRequest(SpiNNakerControl control)
393
                                throws InterruptedException;
394
        }
395

396
        /**
397
         * Describes a request to modify the power status of a collection of boards.
398
         * The boards must be on a single machine and must all be assigned to a
399
         * single job.
400
         * <p>
401
         * This is the message that is sent from the main thread to the per-BMP
402
         * worker threads.
403
         *
404
         * @author Donal Fellows
405
         */
406
        private final class PowerRequest extends Request {
407
                private final List<BMPBoard> powerOnBoards = new ArrayList<>();
1✔
408

409
                private final List<BMPBoard> powerOffBoards = new ArrayList<>();
1✔
410

411
                private final List<Link> linkRequests = new ArrayList<>();
1✔
412

413
                private final int jobId;
414

415
                private final JobState from;
416

417
                private final JobState to;
418

419
                private final List<Integer> changeIds = new ArrayList<>();
1✔
420

421
                private final Map<Integer, Integer> boardToId = new HashMap<>();
1✔
422

423
                /**
424
                 * Create a request.
425
                 *
426
                 * @param sql
427
                 *            How to access the database.
428
                 * @param machine
429
                 *            What machine are the boards on? <em>Must not</em> be
430
                 *            {@code null}.
431
                 * @param powerOn
432
                 *            What boards (by DB ID) are to be powered on? May be
433
                 *            {@code null}; that's equivalent to the empty list.
434
                 * @param powerOff
435
                 *            What boards (by DB ID) are to be powered off? May be
436
                 *            {@code null}; that's equivalent to the empty list.
437
                 * @param links
438
                 *            Any link power control requests. By default, links are on
439
                 *            if their board is on and they are connected; it is
440
                 *            <em>useful and relevant</em> to modify the power state of
441
                 *            links on the periphery of an allocation. May be
442
                 *            {@code null}; that's equivalent to the empty list.
443
                 * @param jobId
444
                 *            For what job is this?
445
                 * @param from
446
                 *            What state is the job moving from?
447
                 * @param to
448
                 *            What state is the job moving to?
449
                 * @param changeIds
450
                 *            The DB ids that describe the change, so we can update
451
                 *            those records.
452
                 * @param idToBoard
453
                 *            How to get the physical ID of a board from its database ID
454
                 */
455
                PowerRequest(int bmpId, int jobId, JobState from, JobState to,
456
                                List<PowerChange> powerChanges) {
1✔
457
                        super(bmpId);
1✔
458
                        for (var change : powerChanges) {
1✔
459
                                if (change.power()) {
1✔
460
                                        powerOnBoards.add(new BMPBoard(change.boardNum()));
1✔
461
                                } else {
462
                                        powerOffBoards.add(new BMPBoard(change.boardNum()));
1✔
463
                                }
464
                                change.offLinks().stream().forEach(link ->
1✔
465
                                                linkRequests.add(new Link(change.boardNum(), link)));
1✔
466
                                changeIds.add(change.changeId());
1✔
467
                                boardToId.put(change.boardNum(), change.boardId);
1✔
468
                        }
1✔
469
                        this.jobId = jobId;
1✔
470
                        this.from = from;
1✔
471
                        this.to = to;
1✔
472
                }
1✔
473

474
                /**
475
                 * Change the power state of boards in this request.
476
                 *
477
                 * @param controller
478
                 *            How to actually communicate with the machine
479
                 * @throws ProcessException
480
                 *             If the transceiver chokes
481
                 * @throws InterruptedException
482
                 *             If interrupted
483
                 * @throws IOException
484
                 *             If network I/O fails
485
                 */
486
                void changeBoardPowerState(SpiNNakerControl controller)
487
                                throws ProcessException, InterruptedException, IOException {
488

489
                        // Send any power on commands
490
                        if (!powerOnBoards.isEmpty()) {
1✔
491
                                controller.powerOnAndCheck(powerOnBoards);
1✔
492
                        }
493

494
                        // Process perimeter link requests next
495
                        for (var linkReq : linkRequests) {
1✔
496
                                // Set the link state, as required
497
                                controller.setLinkOff(linkReq);
1✔
498
                        }
1✔
499

500
                        // Finally send any power off commands
501
                        if (!powerOffBoards.isEmpty()) {
1✔
502
                                controller.powerOff(powerOffBoards);
1✔
503
                        }
504
                }
1✔
505

506
                /**
507
                 * Handles the database changes after a set of changes to a BMP complete
508
                 * successfully. We will move the job to the state it is supposed to be in.
509
                 *
510
                 * @param sql
511
                 *            How to access the DB
512
                 * @return Whether the state of boards or jobs has changed.
513
                 */
514
                private void done() {
515
                        try (var c = getConnection();
1✔
516
                                        var deallocateBoards = c.update(DEALLOCATE_BMP_BOARDS_JOB);
1✔
517
                                        var deleteChange = c.update(FINISHED_PENDING);
1✔
518
                                        var setBoardPowerOn = c.update(SET_BOARD_POWER_ON);
1✔
519
                                        var setBoardPowerOff = c.update(SET_BOARD_POWER_OFF)) {
1✔
520
                                c.transaction(() -> {
1✔
521
                                        int turnedOn = powerOnBoards.stream().map(this::getBoardId)
1✔
522
                                                        .mapToInt(setBoardPowerOn::call).sum();
1✔
523
                                        int turnedOff =
1✔
524
                                                        powerOffBoards.stream().map(this::getBoardId)
1✔
525
                                                                        .mapToInt(setBoardPowerOff::call).sum();
1✔
526

527
                                        if (to == DESTROYED || to == QUEUED) {
1✔
528
                                                /*
529
                                                 * Need to mark the boards as not allocated; can't do
530
                                                 * that until they've been switched off.
531
                                                 */
532
                                                deallocateBoards.call(jobId, bmpId);
1✔
533
                                        }
534
                                        int completed = changeIds.stream().mapToInt(
1✔
535
                                                        deleteChange::call).sum();
1✔
536

537
                                        log.debug("BMP ACTION SUCCEEDED ({}:{}->{}): on:{} off:{} "
1✔
538
                                                        + "completed: {}",
539
                                                        jobId, from, to, turnedOn, turnedOff, completed);
1✔
540
                                });
1✔
541
                        }
542

543
                        // Tell the allocator something has happened
544
                        allocator.updateJob(jobId, from, to);
1✔
545
                }
1✔
546

547
                /**
548
                 * Handles the database changes after a set of changes to a BMP complete
549
                 * with a failure. We will roll back the job state to what it was
550
                 * before.
551
                 *
552
                 * @param sql
553
                 *            How to access the DB
554
                 * @return Whether the state of boards or jobs has changed.
555
                 */
556
                private void failed() {
557
                        var resetJobAlloc = false;
×
558
                        try (var c = getConnection();
×
559
                                        var deallocateBoards = c.update(DEALLOCATE_BMP_BOARDS_JOB);
×
560
                                        var deleteChange = c.update(FINISHED_PENDING);
×
561
                                        var setBoardPowerOff = c.update(SET_BOARD_POWER_OFF)) {
×
562
                                resetJobAlloc = c.transaction(() -> {
×
563
                                        // We should mark the boards as off
564
                                        int turnedOff =
×
565
                                                        powerOffBoards.stream().map(this::getBoardId)
×
566
                                                                        .mapToInt(setBoardPowerOff::call).sum();
×
567

568
                                        // ... even those that we should be powering on ...
569
                                        turnedOff +=
×
570
                                                        powerOnBoards.stream().map(this::getBoardId)
×
571
                                                                        .mapToInt(setBoardPowerOff::call).sum();
×
572

573
                                        // Deallocate the boards on this bmp from the job;
574
                                        // other boards can be deallocated elsewhere.
575
                                        deallocateBoards.call(jobId, bmpId);
×
576

577
                                        // Delete change ids as they are done even if failed.
578
                                        var completed = changeIds.stream().mapToInt(
×
579
                                                        deleteChange::call).sum();
×
580

581
                                        log.debug(
×
582
                                                        "BMP ACTION FAILED on {} ({}:{}->{}) off:{} "
583
                                                        + "completed:{}",
584
                                                        bmpId, jobId, from, to, turnedOff, completed);
×
585

586
                                        // If we were meant to be powering up, reset the allocation
587
                                        // once done here.
588
                                        return (to == READY && powerOffBoards.isEmpty());
×
589
                                });
590
                        }
591
                        if (resetJobAlloc) {
×
592
                                allocator.resetPowerOnFailure(jobId);
×
593
                        }
594
                }
×
595

596
                /**
597
                 * Process an action to power on or off a set of boards. Runs on a
598
                 * thread that may touch a BMP directly, but which may not touch the
599
                 * database.
600
                 *
601
                 * @param controller
602
                 *            How to actually reach the BMPs.
603
                 * @return Whether this action has "succeeded" and shouldn't be retried.
604
                 * @throws InterruptedException
605
                 *             If interrupted.
606
                 */
607
                @Override
608
                boolean tryProcessRequest(SpiNNakerControl controller)
609
                                throws InterruptedException {
610
                        boolean ok = bmpAction(() -> {
1✔
611
                                changeBoardPowerState(controller);
1✔
612
                                // We want to ensure the lead board is alive
613
                                if (!serviceControl.isUseDummyBMP()) {
1✔
614
                                        // Don't bother with pings when the dummy is enabled
615
                                        controller.ping(powerOnBoards);
×
616
                                }
617
                                done();
1✔
618
                        }, e -> {
1✔
619
                                failed();
×
620
                                synchronized (BMPController.this) {
×
621
                                        bmpProcessingException = e;
×
622
                                }
×
623
                        }, ppe -> {
×
624
                                badBoard(ppe);
×
625
                        });
×
626
                        return ok;
1✔
627
                }
628

629
                @Override
630
                public String toString() {
631
                        var sb = new StringBuilder("PowerRequest(for=")
×
632
                                        .append(bmpId);
×
633
                        sb.append(";on=").append(powerOnBoards);
×
634
                        sb.append(",off=").append(powerOffBoards);
×
635
                        sb.append(",links=").append(linkRequests);
×
636
                        return sb.append(")").toString();
×
637
                }
638

639
                private static final String REPORT_MSG =
640
                                "board was not reachable when trying to power it: ";
641

642
                /**
643
                 * When a BMP is unroutable, we must tell the alloc engine to pick
644
                 * somewhere else, and we should mark the board as out of service too;
645
                 * it's never going to work so taking it out right away is the only sane
646
                 * plan. We also need to nuke the planned changes. Retrying is bad.
647
                 *
648
                 * @param failure
649
                 *            The failure message.
650
                 * @return Whether the state of boards or jobs has changed.
651
                 */
652
                private void badBoard(ProcessException failure) {
653
                        try (var c = getConnection()) {
×
654
                                c.transaction(() -> {
×
655
                                        getBoardId(failure.source).ifPresent(boardId -> {
×
656
                                                // Mark the board as dead right now
657
                                                markBoardAsDead(c, boardId, REPORT_MSG + failure);
×
658
                                                // Add a report if we can
659
                                                addBoardReport(c, boardId, jobId, REPORT_MSG + failure);
×
660
                                        });
×
661
                                });
×
662
                        }
663
                }
×
664

665
                /**
666
                 * Given a board address, get the ID that it corresponds to. Reverses
667
                 * {@link #idToBoard}.
668
                 *
669
                 * @param addr
670
                 *            The board address.
671
                 * @return The ID, if one can be found.
672
                 */
673
                private Optional<Integer> getBoardId(HasBMPLocation addr) {
674
                        return Optional.ofNullable(boardToId.get(addr.getBoard()));
×
675
                }
676

677
                private Integer getBoardId(BMPBoard board) {
678
                        return boardToId.get(board.board());
1✔
679
                }
680
        }
681

682
        /**
683
         * A request to read or write information on a BMP. Includes blacklists,
684
         * serial numbers, temperature data, etc.
685
         *
686
         * @author Donal Fellows
687
         */
688
        private final class BoardRequest extends Request {
689
                private final NonBootOperation op;
690

691
                private final int opId;
692

693
                private final int boardId;
694

695
                private final BMPCoords bmp;
696

697
                private final BMPBoard board;
698

699
                private final String bmpSerialId;
700

701
                private final Blacklist blacklist;
702

703
                private final int machineId;
704

705
                private BoardRequest(int bmpId, NonBootOperation op, Row row) {
1✔
706
                        super(bmpId);
1✔
707
                        this.op = op;
1✔
708
                        opId = row.getInt("op_id");
1✔
709
                        boardId = row.getInt("board_id");
1✔
710
                        bmp = new BMPCoords(row.getInt("cabinet"), row.getInt("frame"));
1✔
711
                        board = new BMPBoard(row.getInt("board_num"));
1✔
712
                        if (op == WRITE_BL) {
1✔
713
                                blacklist = row.getSerial("data", Blacklist.class);
1✔
714
                        } else {
715
                                blacklist = null;
1✔
716
                        }
717
                        bmpSerialId = row.getString("bmp_serial_id");
1✔
718
                        machineId = row.getInt("machine_id");
1✔
719
                }
1✔
720

721
                /** The serial number actually read from the board. */
722
                private String readSerial;
723

724
                /**
725
                 * Access the DB to store the serial number information that we
726
                 * retrieved. A transaction should already be held.
727
                 *
728
                 * @param c
729
                 *            How to access the DB
730
                 * @return Whether we've changed anything
731
                 */
732
                private void recordSerialIds(Connection c) {
733
                        try (var setBoardSerialIds = c.update(SET_BOARD_SERIAL_IDS)) {
1✔
734
                                setBoardSerialIds.call(boardId, readSerial,
1✔
735
                                                phySerMap.getPhysicalId(readSerial));
1✔
736
                        }
737
                }
1✔
738

739
                /**
740
                 * Access the DB to mark the read request as successful and store the
741
                 * blacklist that was read. A transaction should already be held.
742
                 *
743
                 * @param c
744
                 *            How to access the DB
745
                 * @param readBlacklist
746
                 *            The blacklist that was read
747
                 * @return Whether we've changed anything
748
                 */
749
                private void doneReadBlacklist(Connection c, Blacklist readBlacklist) {
750
                        try (var completed = c.update(COMPLETED_BOARD_INFO_READ)) {
1✔
751
                                log.debug("Completing blacklist read opId {}", opId);
1✔
752
                                completed.call(readBlacklist, opId);
1✔
753
                        }
754
                }
1✔
755

756
                /**
757
                 * Access the DB to mark the write request as successful. A transaction
758
                 * should already be held.
759
                 *
760
                 * @param c
761
                 *            How to access the DB
762
                 * @return Whether we've changed anything
763
                 */
764
                private void doneWriteBlacklist(Connection c) {
765
                        try (var completed = c.update(COMPLETED_BLACKLIST_WRITE)) {
1✔
766
                                completed.call(opId);
1✔
767
                        }
768
                }
1✔
769

770
                /**
771
                 * Access the DB to mark the read request as successful; the actual
772
                 * store of the serial data is elsewhere
773
                 * ({@link #recordSerialIds(Connection)}). A transaction should already
774
                 * be held.
775
                 *
776
                 * @param c
777
                 *            How to access the DB
778
                 * @return Whether we've changed anything
779
                 */
780
                private void doneReadSerial(Connection c) {
781
                        try (var completed = c.update(COMPLETED_GET_SERIAL_REQ)) {
1✔
782
                                completed.call(opId);
1✔
783
                        }
784
                }
1✔
785

786
                /**
787
                 * Access the DB to mark the read request as successful and store the
788
                 * ADC info that was read. A transaction should be held.
789
                 *
790
                 * @param c
791
                 *            The database connection.
792
                 */
793
                private void doneReadTemps(Connection c, ADCInfo adcInfo) {
794
                        try (var completed = c.update(COMPLETED_BOARD_INFO_READ)) {
×
795
                                log.debug("Completing temperature read opId {}", opId);
×
796
                                completed.call(adcInfo, opId);
×
797
                        }
798
                }
×
799

800
                /**
801
                 * Access the DB to mark the request as failed and store the exception.
802
                 *
803
                 * @param exn
804
                 *            The exception that caused the failure.
805
                 * @return Whether we've changed anything
806
                 */
807
                private void failed(Exception exn) {
808
                        try (var c = getConnection();
×
809
                                        var failed = c.update(FAILED_BLACKLIST_OP)) {
×
810
                                c.transaction(() -> failed.call(exn, opId));
×
811
                        }
812
                }
×
813

814
                private static final String REPORT_MSG =
815
                                "board was not reachable when trying to access its blacklist: ";
816

817
                /**
818
                 * Access the DB to mark a board as out of service.
819
                 *
820
                 * @param exn
821
                 *            The exception that caused the failure.
822
                 * @return Whether we've changed anything
823
                 */
824
                void takeOutOfService(Exception exn) {
825
                        try (var c = getConnection()) {
×
826
                                c.transaction(() -> {
×
827
                                        addBoardReport(c, boardId, null, REPORT_MSG + exn);
×
828
                                        markBoardAsDead(c, boardId, REPORT_MSG + exn);
×
829
                                });
×
830
                        }
831
                }
×
832

833
                /**
834
                 * Process an action to work with a blacklist or serial number. Runs on
835
                 * a thread that may touch a BMP directly, but which may not touch the
836
                 * database.
837
                 *
838
                 * @param controller
839
                 *            How to actually reach the BMP.
840
                 * @return Whether this action has "succeeded" and shouldn't be retried.
841
                 * @throws InterruptedException
842
                 *             If interrupted.
843
                 */
844
                @Override
845
                boolean tryProcessRequest(SpiNNakerControl controller)
846
                                throws InterruptedException {
847
                        return bmpAction(() -> {
1✔
848
                                switch (op) {
1✔
849
                                case WRITE_BL -> writeBlacklist(controller);
1✔
850
                                case READ_BL -> readBlacklist(controller);
1✔
851
                                case GET_SERIAL -> readSerial(controller);
1✔
852
                                case READ_TEMP ->  readTemps(controller);
×
853
                                default -> throw new IllegalArgumentException();
×
854
                                }
855
                                epochs.blacklistChanged(boardId);
1✔
856
                                epochs.machineChanged(machineId);
1✔
857
                        }, e -> {
1✔
858
                                failed(e);
×
859
                        }, ppe -> {
×
860
                                takeOutOfService(ppe);
×
861
                        });
×
862
                }
863

864
                /**
865
                 * Process an action to read a blacklist.
866
                 *
867
                 * @param controller
868
                 *            How to actually reach the BMP.
869
                 * @throws InterruptedException
870
                 *             If interrupted.
871
                 * @throws IOException
872
                 *             If the network is unhappy.
873
                 * @throws ProcessException
874
                 *             If the BMP rejects a message.
875
                 */
876
                private void readBlacklist(SpiNNakerControl controller)
877
                                throws InterruptedException, ProcessException, IOException {
878
                        readSerial = controller.readSerial(board);
1✔
879
                        if (bmpSerialId != null && !bmpSerialId.equals(readSerial)) {
1✔
880
                                /*
881
                                 * Doesn't match; WARN but keep going; hardware may just be
882
                                 * remapped behind our back.
883
                                 */
884
                                log.warn(
×
885
                                                "blacklist read mismatch: expected serial ID '{}' "
886
                                                                + "not equal to actual serial ID '{}'",
887
                                                bmpSerialId, readSerial);
888
                        }
889
                        var readBlacklist = controller.readBlacklist(board);
1✔
890
                        try (var c = getConnection()) {
1✔
891
                                c.transaction(() -> {
1✔
892
                                        recordSerialIds(c);
1✔
893
                                        doneReadBlacklist(c, readBlacklist);
1✔
894
                                });
1✔
895
                        }
896
                }
1✔
897

898
                /**
899
                 * Process an action to write a blacklist.
900
                 *
901
                 * @param controller
902
                 *            How to actually reach the BMP.
903
                 * @throws InterruptedException
904
                 *             If interrupted.
905
                 * @throws IOException
906
                 *             If the network is unhappy.
907
                 * @throws ProcessException
908
                 *             If the BMP rejects a message.
909
                 * @throws IllegalStateException
910
                 *             If the operation is applied to a board other than the one
911
                 *             that it is expected to apply to.
912
                 */
913
                private void writeBlacklist(SpiNNakerControl controller)
914
                                throws InterruptedException, ProcessException, IOException {
915
                        readSerial = controller.readSerial(board);
1✔
916
                        if (bmpSerialId != null && !bmpSerialId.equals(readSerial)) {
1✔
917
                                // Doesn't match, so REALLY unsafe to keep going!
918
                                throw new IllegalStateException(format(
×
919
                                                "aborting blacklist write: expected serial ID '%s' "
920
                                                                + "not equal to actual serial ID '%s'",
921
                                                bmpSerialId, readSerial));
922
                        }
923
                        controller.writeBlacklist(board, requireNonNull(blacklist));
1✔
924
                        try (var c = getConnection()) {
1✔
925
                                c.transaction(() -> doneWriteBlacklist(c));
1✔
926
                        }
927
                }
1✔
928

929
                /**
930
                 * Process an action to read the serial number from a BMP.
931
                 *
932
                 * @param controller
933
                 *            How to actually reach the BMP.
934
                 * @throws InterruptedException
935
                 *             If interrupted.
936
                 * @throws IOException
937
                 *             If the network is unhappy
938
                 * @throws ProcessException
939
                 *             If the BMP rejects a message.
940
                 */
941
                private void readSerial(SpiNNakerControl controller)
942
                                throws InterruptedException, ProcessException, IOException {
943
                        readSerial = controller.readSerial(board);
1✔
944
                        try (var c = getConnection()) {
1✔
945
                                c.transaction(() -> {
1✔
946
                                        recordSerialIds(c);
1✔
947
                                        doneReadSerial(c);
1✔
948
                                });
1✔
949
                        }
950
                }
1✔
951

952
                /**
953
                 * Process an action to read some temperature data.
954
                 *
955
                 * @param controller
956
                 *            How to actually reach the BMP.
957
                 * @throws InterruptedException
958
                 *             If interrupted.
959
                 * @throws IOException
960
                 *             If the network is unhappy.
961
                 * @throws ProcessException
962
                 *             If the BMP rejects a message.
963
                 */
964
                private void readTemps(SpiNNakerControl controller)
965
                                throws InterruptedException, ProcessException, IOException {
966
                        var adcInfo = controller.readTemp(board);
×
967
                        try (var c = getConnection()) {
×
968
                                c.transaction(() -> doneReadTemps(c, adcInfo));
×
969
                        }
970
                }
×
971

972
                @Override
973
                public String toString() {
974
                        var sb = new StringBuilder("BoardRequest(for ");
×
975
                        sb.append("bmp=").append(bmp);
×
976
                        sb.append(",board=").append(boardId);
×
977
                        sb.append(",op=").append(op);
×
978
                        return sb.append(")").toString();
×
979
                }
980
        }
981

982
        private record PowerChange(Integer changeId, int jobId, Integer boardId,
1✔
983
                        Integer boardNum, boolean power, JobState from, JobState to,
984
                        List<Direction> offLinks) {
985
                PowerChange(Row row) {
986
                        this(row.getInteger("change_id"), //
1✔
987
                                        row.getInt("job_id"), //
1✔
988
                                        row.getInteger("board_id"), //
1✔
989
                                        row.getInteger("board_num"), //
1✔
990
                                        row.getBoolean("power"),
1✔
991
                                        row.getEnum("from_state", JobState.class),
1✔
992
                                        row.getEnum("to_state", JobState.class),
1✔
993
                                        List.of(Direction.values()).stream()
1✔
994
                                                        .filter(link -> !row.getBoolean(link.columnName))
1✔
995
                                                        .collect(Collectors.toList()));
1✔
996
                }
1✔
997

998
                boolean isSameJob(PowerChange p) {
999
                        return p.jobId == jobId && p.from == from && p.to == to;
×
1000
                }
1001
        }
1002

1003
        // ----------------------------------------------------------------
1004
        // WORKER IMPLEMENTATION
1005

1006
        /** A worker of a given BMP. */
1007
        private final class Worker implements Runnable {
1008
                /** What are we controlling? */
1009
                private final SpiNNakerControl control;
1010

1011
                /** Which boards are we looking at? */
1012
                private final int bmpId;
1013

1014
                Worker(SpiNNakerControl control, int bmpId) {
1✔
1015
                        this.control = control;
1✔
1016
                        this.bmpId = bmpId;
1✔
1017

1018
                        log.debug("Created worker for boards {}", bmpId);
1✔
1019
                }
1✔
1020

1021
                /**
1022
                 * Periodically call to update, or trigger externally.
1023
                 */
1024
                @Override
1025
                public synchronized void run() {
1026
                        log.trace("Searching for changes on BMP {}", bmpId);
1✔
1027

1028
                        try {
1029
                                var changes = getRequestedOperations();
1✔
1030
                                for (var change : changes) {
1✔
1031
                                        change.processRequest(control);
1✔
1032
                                }
1✔
1033
                        } catch (Exception e) {
×
1034
                                log.error("unhandled exception for BMP '{}'", bmpId, e);
×
1035
                        }
1✔
1036
                }
1✔
1037

1038
                /**
1039
                 * Get the things that we want the worker to do. <em>Be very
1040
                 * careful!</em> Because this necessarily involves the database, this
1041
                 * must not touch the BMP handle as those operations take a long time
1042
                 * and we absolutely must not have a transaction open at the same time.
1043
                 *
1044
                 * @return List of operations to perform.
1045
                 */
1046
                private List<Request> getRequestedOperations() {
1047
                        var requests = new ArrayList<Request>();
1✔
1048
                        try (var c = getConnection();
1✔
1049
                                        var getPowerRequests = c.query(GET_CHANGES);
1✔
1050
                                        var getBlacklistReads = c.query(GET_BLACKLIST_READS);
1✔
1051
                                        var getBlacklistWrites = c.query(GET_BLACKLIST_WRITES);
1✔
1052
                                        var getReadSerialInfos = c.query(GET_SERIAL_INFO_REQS);
1✔
1053
                                        var getReadTemps = c.query(GET_TEMP_INFO_REQS)) {
1✔
1054
                                c.transaction(false, () -> {
1✔
1055
                                        // Batch power requests by job
1056
                                        var powerChanges = new LinkedList<>(
1✔
1057
                                                        getPowerRequests.call(PowerChange::new, bmpId));
1✔
1058
                                        while (!powerChanges.isEmpty()) {
1✔
1059
                                                var change = powerChanges.poll();
1✔
1060
                                                var jobChanges = new ArrayList<>(List.of(change));
1✔
1061
                                                while (!powerChanges.isEmpty()
1✔
1062
                                                                && change.isSameJob(powerChanges.peek())) {
×
1063
                                                        jobChanges.add(powerChanges.poll());
×
1064
                                                }
1065
                                                if (!jobChanges.isEmpty()) {
1✔
1066
                                                        log.debug("Running job changes {}", jobChanges);
1✔
1067
                                                        requests.add(new PowerRequest(bmpId, change.jobId(),
1✔
1068
                                                                        change.from(), change.to(), jobChanges));
1✔
1069
                                                }
1070
                                        }
1✔
1071

1072
                                        // Leave these until quiet
1073
                                        if (requests.isEmpty()) {
1✔
1074
                                                requests.addAll(getBlacklistReads.call(
1✔
1075
                                                                row -> new BoardRequest(bmpId, READ_BL, row),
1✔
1076
                                                                bmpId));
1✔
1077
                                        }
1078
                                        if (requests.isEmpty()) {
1✔
1079
                                                requests.addAll(getBlacklistWrites.call(
1✔
1080
                                                                row -> new BoardRequest(bmpId, WRITE_BL, row),
1✔
1081
                                                                bmpId));
1✔
1082
                                                requests.addAll(getReadSerialInfos.call(
1✔
1083
                                                                row -> new BoardRequest(bmpId, GET_SERIAL, row),
1✔
1084
                                                                bmpId));
1✔
1085
                                                requests.addAll(getReadTemps.call(
1✔
1086
                                                                row -> new BoardRequest(bmpId, READ_TEMP, row),
×
1087
                                                                bmpId));
1✔
1088
                                        }
1089
                                });
1✔
1090
                        } catch (Exception e) {
×
1091
                                log.error("unhandled exception for BMP '{}'", bmpId, e);
×
1092
                        }
1✔
1093
                        return requests;
1✔
1094
                }
1095
        }
1096

1097
        /**
1098
         * The testing interface.
1099
         *
1100
         * @hidden
1101
         */
1102
        @ForTestingOnly
1103
        public interface TestAPI {
1104
                /**
1105
                 * Ensure things are set up after a database change that updates the
1106
                 * BMPs in the system.
1107
                 */
1108
                void prepare();
1109

1110
                /**
1111
                 * The core of the scheduler.
1112
                 *
1113
                 * @param millis
1114
                 *            How many milliseconds to sleep before doing a rerun of the
1115
                 *            scheduler. If zero (or less), only one run will be done.
1116
                 * @param bmps
1117
                 *            The BMPs to be updated.
1118
                 * @throws IOException
1119
                 *             If talking to the network fails
1120
                 * @throws SpinnmanException
1121
                 *             If a BMP sends an error back
1122
                 * @throws InterruptedException
1123
                 *             If the wait for workers to spawn fails.
1124
                 */
1125
                void processRequests(long millis, Collection<Integer> bmps)
1126
                                throws IOException, SpinnmanException, InterruptedException;
1127

1128
                /**
1129
                 * The core of the scheduler. Will process for all known BMPs.
1130
                 *
1131
                 * @param millis
1132
                 *            How many milliseconds to sleep before doing a rerun of the
1133
                 *            scheduler. If zero (or less), only one run will be done.
1134
                 * @throws IOException
1135
                 *             If talking to the network fails
1136
                 * @throws SpinnmanException
1137
                 *             If a BMP sends an error back
1138
                 * @throws InterruptedException
1139
                 *             If the wait for workers to spawn fails.
1140
                 */
1141
                void processRequests(long millis)
1142
                                throws IOException, SpinnmanException, InterruptedException;
1143

1144
                /**
1145
                 * Get the most recently thrown BMP processing exception.
1146
                 *
1147
                 * @return Current processing exception.
1148
                 */
1149
                Throwable getBmpException();
1150

1151
                /** Clear the current processing exception. */
1152
                void clearBmpException();
1153
        }
1154

1155
        /**
1156
         * @return The test interface.
1157
         * @deprecated This interface is just for testing.
1158
         * @hidden
1159
         */
1160
        @ForTestingOnly
1161
        @RestrictedApi(explanation = "just for testing", link = "index.html",
1162
                        allowedOnPath = ".*/src/test/java/.*")
1163
        @Deprecated
1164
        public final TestAPI getTestAPI() {
1165
                ForTestingOnly.Utils.checkForTestClassOnStack();
1✔
1166
                return new TestAPI() {
1✔
1167
                        @Override
1168
                        public void prepare() {
1169
                                makeWorkers();
1✔
1170
                        }
1✔
1171

1172
                        @Override
1173
                        public void processRequests(long millis, Collection<Integer> bmps)
1174
                                        throws IOException, SpinnmanException,
1175
                                        InterruptedException {
1176
                                /*
1177
                                 * Runs twice because it takes two cycles to fully process a
1178
                                 * request.
1179
                                 */
1180
                                triggerSearch(bmps);
1✔
1181
                                if (millis > 0) {
1✔
1182
                                        Thread.sleep(millis);
1✔
1183
                                        triggerSearch(bmps);
1✔
1184
                                }
1185
                        }
1✔
1186

1187
                        @Override
1188
                        public void processRequests(long millis) throws IOException,
1189
                                        SpinnmanException, InterruptedException {
1190
                                processRequests(millis, workers.keySet());
1✔
1191
                        }
1✔
1192

1193
                        @Override
1194
                        public Throwable getBmpException() {
1195
                                synchronized (BMPController.this) {
1✔
1196
                                        return bmpProcessingException;
1✔
1197
                                }
1198
                        }
1199

1200
                        @Override
1201
                        public void clearBmpException() {
1202
                                synchronized (BMPController.this) {
1✔
1203
                                        bmpProcessingException = null;
1✔
1204
                                }
1✔
1205
                        }
1✔
1206
                };
1207
        }
1208
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc