• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

SpiNNakerManchester / JavaSpiNNaker / 6092811705

05 Sep 2023 06:30AM UTC coverage: 37.105% (+0.3%) from 36.847%
6092811705

push

github

web-flow
Merge pull request #1033 from SpiNNakerManchester/fix_retry_failure_case

Add power off timestamp to power on boards on failure

3 of 3 new or added lines in 1 file covered. (100.0%)

8712 of 23479 relevant lines covered (37.11%)

0.74 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

62.5
/SpiNNaker-allocserv/src/main/java/uk/ac/manchester/spinnaker/alloc/bmp/BMPController.java
1
/*
2
 * Copyright (c) 2021 The University of Manchester
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *     https://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
package uk.ac.manchester.spinnaker.alloc.bmp;
17

18
import static java.lang.String.format;
19
import static java.lang.Thread.currentThread;
20
import static java.lang.Thread.sleep;
21
import static java.util.Objects.requireNonNull;
22
import static org.slf4j.LoggerFactory.getLogger;
23
import static uk.ac.manchester.spinnaker.alloc.bmp.NonBootOperation.GET_SERIAL;
24
import static uk.ac.manchester.spinnaker.alloc.bmp.NonBootOperation.READ_BL;
25
import static uk.ac.manchester.spinnaker.alloc.bmp.NonBootOperation.READ_TEMP;
26
import static uk.ac.manchester.spinnaker.alloc.bmp.NonBootOperation.WRITE_BL;
27
import static uk.ac.manchester.spinnaker.alloc.model.JobState.DESTROYED;
28
import static uk.ac.manchester.spinnaker.alloc.model.JobState.QUEUED;
29
import static uk.ac.manchester.spinnaker.alloc.model.JobState.READY;
30

31
import java.io.IOException;
32
import java.lang.Thread.UncaughtExceptionHandler;
33
import java.time.Instant;
34
import java.util.ArrayList;
35
import java.util.Collection;
36
import java.util.HashMap;
37
import java.util.LinkedList;
38
import java.util.List;
39
import java.util.Map;
40
import java.util.Optional;
41
import java.util.function.Consumer;
42
import java.util.stream.Collectors;
43

44
import javax.annotation.PostConstruct;
45

46
import org.slf4j.Logger;
47
import org.springframework.beans.factory.ObjectProvider;
48
import org.springframework.beans.factory.annotation.Autowired;
49
import org.springframework.jmx.export.annotation.ManagedResource;
50
import org.springframework.scheduling.TaskScheduler;
51
import org.springframework.scheduling.concurrent.ThreadPoolTaskScheduler;
52
import org.springframework.stereotype.Service;
53

54
import com.google.errorprone.annotations.RestrictedApi;
55
import com.google.errorprone.annotations.concurrent.GuardedBy;
56

57
import uk.ac.manchester.spinnaker.alloc.ForTestingOnly;
58
import uk.ac.manchester.spinnaker.alloc.ServiceMasterControl;
59
import uk.ac.manchester.spinnaker.alloc.SpallocProperties.AllocatorProperties;
60
import uk.ac.manchester.spinnaker.alloc.SpallocProperties.TxrxProperties;
61
import uk.ac.manchester.spinnaker.alloc.admin.ReportMailSender;
62
import uk.ac.manchester.spinnaker.alloc.allocator.AllocatorTask;
63
import uk.ac.manchester.spinnaker.alloc.allocator.Epochs;
64
import uk.ac.manchester.spinnaker.alloc.allocator.SpallocAPI;
65
import uk.ac.manchester.spinnaker.alloc.db.DatabaseAPI.Connection;
66
import uk.ac.manchester.spinnaker.alloc.db.DatabaseAwareBean;
67
import uk.ac.manchester.spinnaker.alloc.db.Row;
68
import uk.ac.manchester.spinnaker.alloc.model.Direction;
69
import uk.ac.manchester.spinnaker.alloc.model.JobState;
70
import uk.ac.manchester.spinnaker.machine.board.BMPBoard;
71
import uk.ac.manchester.spinnaker.machine.board.BMPCoords;
72
import uk.ac.manchester.spinnaker.machine.board.HasBMPLocation;
73
import uk.ac.manchester.spinnaker.messages.model.ADCInfo;
74
import uk.ac.manchester.spinnaker.messages.model.Blacklist;
75
import uk.ac.manchester.spinnaker.transceiver.ProcessException;
76
import uk.ac.manchester.spinnaker.transceiver.ProcessException.CallerProcessException;
77
import uk.ac.manchester.spinnaker.transceiver.ProcessException.PermanentProcessException;
78
import uk.ac.manchester.spinnaker.transceiver.ProcessException.TransientProcessException;
79
import uk.ac.manchester.spinnaker.transceiver.SpinnmanException;
80
import uk.ac.manchester.spinnaker.utils.UsedInJavadocOnly;
81

82
/**
83
 * Manages the BMPs of machines controlled by Spalloc.
84
 *
85
 * @author Donal Fellows
86
 */
87
@Service("bmpController")
88
@ManagedResource("Spalloc:type=BMPController,name=bmpController")
89
public class BMPController extends DatabaseAwareBean {
3✔
90
        private static final Logger log = getLogger(BMPController.class);
3✔
91

92
        @Autowired
93
        private SpallocAPI spallocCore;
94

95
        @Autowired
96
        private ServiceMasterControl serviceControl;
97

98
        @Autowired
99
        private Epochs epochs;
100

101
        @Autowired
102
        private TxrxProperties props;
103

104
        @Autowired
105
        private PhysicalSerialMapping phySerMap;
106

107
        @Autowired
108
        private AllocatorProperties allocProps;
109

110
        @Autowired
111
        private ReportMailSender emailSender;
112

113
        @Autowired
114
        private AllocatorTask allocator;
115

116
        private TaskScheduler scheduler;
117

118
        /**
119
         * Map from BMP ID to worker task that handles it.
120
         */
121
        private final Map<Integer, Worker> workers = new HashMap<>();
3✔
122

123
        /**
124
         * Factory for {@linkplain SpiNNakerControl controllers}. Only use via
125
         * {@link #controllerFactory}.
126
         */
127
        @Autowired
128
        private ObjectProvider<SpiNNakerControl> controllerFactoryBean;
129

130
        /**
131
         * Type-safe factory for {@linkplain SpiNNakerControl controllers}.
132
         */
133
        private SpiNNakerControl.Factory controllerFactory;
134

135
        @GuardedBy("this")
136
        private Throwable bmpProcessingException;
137

138
        /**
139
         * An {@link UncaughtExceptionHandler}.
140
         *
141
         * @param thread
142
         *            The thread with the problem.
143
         * @param exception
144
         *            The exception that describes the problem.
145
         */
146
        @UsedInJavadocOnly(UncaughtExceptionHandler.class)
147
        private void handleException(Thread thread, Throwable exception) {
148
                log.error("uncaught exception in BMP worker {}", thread, exception);
×
149
        }
×
150

151
        // ----------------------------------------------------------------
152

153
        @PostConstruct
154
        private void init() {
155
                // Set up scheduler
156
                var sched = new ThreadPoolTaskScheduler();
3✔
157
                scheduler = sched;
3✔
158
                sched.setThreadGroupName("BMP");
3✔
159

160
                controllerFactory = controllerFactoryBean::getObject;
3✔
161
                allocator.setBMPController(this);
3✔
162

163
                // We do the making of workers later in tests
164
                List<Worker> madeWorkers = null;
3✔
165
                if (!serviceControl.isUseDummyBMP()) {
3✔
166
                        madeWorkers = makeWorkers();
×
167
                }
168

169
                // Set the pool size to match the number of workers
170
                if (workers.size() > 1) {
3✔
171
                        sched.setPoolSize(workers.size());
×
172
                }
173

174
                // Launch the scheduler now it is all set up
175
                sched.initialize();
3✔
176

177
                // And now use the scheduler
178
                if (madeWorkers != null) {
3✔
179
                        for (var worker : madeWorkers) {
×
180
                                scheduler.scheduleAtFixedRate(worker, allocProps.getPeriod());
×
181
                        }
×
182
                }
183
        }
3✔
184

185
        private List<Worker> makeWorkers() {
186
                // Make workers
187
                try (var c = getConnection();
3✔
188
                                var getBmps = c.query(GET_ALL_BMPS);
3✔
189
                                var getBoards = c.query(GET_ALL_BMP_BOARDS)) {
3✔
190
                        return c.transaction(false, () -> getBmps.call(row -> {
3✔
191
                                var m = spallocCore.getMachine(row.getString("machine_name"),
3✔
192
                                                true);
193
                                var coords = new BMPCoords(row.getInt("cabinet"),
3✔
194
                                                row.getInt("frame"));
3✔
195
                                var boards = new HashMap<BMPBoard, String>();
3✔
196
                                var bmpId = row.getInt("bmp_id");
3✔
197
                                getBoards.call(r -> {
3✔
198
                                        boards.put(new BMPBoard(r.getInt("board_num")),
3✔
199
                                                        r.getString("address"));
3✔
200
                                        return null;
3✔
201
                                }, bmpId);
3✔
202
                                var control = controllerFactory.create(m.get(), coords, boards);
3✔
203
                                var worker = new Worker(control, bmpId);
3✔
204
                                workers.put(row.getInt("bmp_id"), worker);
3✔
205
                                return worker;
3✔
206
                        }));
207
                }
208
        }
209

210
        /**
211
         * Trigger the execution of the workers for the given BMPs now.
212
         *
213
         * @param bmps
214
         *            A list of BMPs that have changed.
215
         */
216
        public void triggerSearch(Collection<Integer> bmps) {
217
                for (var b : bmps) {
3✔
218
                        var worker = workers.get(b);
3✔
219
                        if (worker != null) {
3✔
220
                                scheduler.schedule(() -> worker.run(), Instant.now());
3✔
221
                        } else {
222
                                log.error("Could not find worker for BMP {}", b);
3✔
223
                        }
224
                }
3✔
225
        }
3✔
226

227
        /** An action that may throw any of a range of exceptions. */
228
        private interface ThrowingAction {
229
                void act() throws ProcessException, IOException, InterruptedException;
230
        }
231

232
        private abstract class Request {
233
                final int bmpId;
234

235
                private int numTries = 0;
3✔
236

237
                Request(int bmpId) {
3✔
238
                        this.bmpId = bmpId;
3✔
239
                }
3✔
240

241
                /**
242
                 * @return Whether this request may be repeated.
243
                 */
244
                boolean isRepeat() {
245
                        return numTries < props.getPowerAttempts();
3✔
246
                }
247

248
                /**
249
                 * Basic machinery for handling exceptions that arise while performing a
250
                 * BMP action. Runs on a thread that may touch a BMP directly, but which
251
                 * may not touch the database.
252
                 * <p>
253
                 * Only subclasses should use this!
254
                 *
255
                 * @param body
256
                 *            What to attempt.
257
                 * @param onFailure
258
                 *            What to do on failure.
259
                 * @param onServiceRemove
260
                 *            If the exception looks serious, call this to trigger a
261
                 *            board being taken out of service.
262
                 * @return Whether to stop the retry loop.
263
                 * @throws InterruptedException
264
                 *             If interrupted.
265
                 */
266
                final boolean bmpAction(ThrowingAction body,
267
                                Consumer<Exception> onFailure,
268
                                Consumer<PermanentProcessException> onServiceRemove)
269
                                throws InterruptedException {
270
                        boolean isLastTry = numTries++ >= props.getPowerAttempts();
3✔
271
                        Exception exn;
272
                        try {
273
                                body.act();
3✔
274
                                // Exit the retry loop (up the stack); the requests all worked
275
                                return true;
3✔
276
                        } catch (InterruptedException e) {
×
277
                                /*
278
                                 * We were interrupted! This happens when we're shutting down.
279
                                 * Log (because we're in an inconsistent state) and rethrow so
280
                                 * that the outside gets to clean up.
281
                                 */
282
                                log.error("Requests failed on BMP {} because of "
×
283
                                                + "interruption", bmpId, e);
×
284
                                currentThread().interrupt();
×
285
                                throw e;
×
286
                        } catch (TransientProcessException e) {
×
287
                                if (!isLastTry) {
×
288
                                        // Log somewhat gently; we *might* be able to recover...
289
                                        log.warn("Retrying requests on BMP {} after {}: {}",
×
290
                                                        bmpId, props.getProbeInterval(),
×
291
                                                        e.getMessage());
×
292
                                        // Ask for a retry
293
                                        return false;
×
294
                                }
295
                                exn = e;
×
296
                                log.error("Requests failed on BMP {}", bmpId, e);
×
297
                        } catch (PermanentProcessException e) {
×
298
                                log.error("BMP {} on {} is unreachable", e.source, bmpId, e);
×
299
                                onServiceRemove.accept(e);
×
300
                                exn = e;
×
301
                        } catch (CallerProcessException e) {
×
302
                                // This is probably a software bug
303
                                log.error("SW bug talking to BMP {}", bmpId, e);
×
304
                                exn = e;
×
305
                        } catch (ProcessException | IOException | RuntimeException e) {
×
306
                                log.error("Requests failed on BMP {}", bmpId, e);
×
307
                                exn = e;
×
308
                        }
×
309
                        /*
310
                         * Common permanent failure handling case; arrange for taking a
311
                         * board out of service, mark a request as failed, and stop the
312
                         * retry loop.
313
                         */
314
                        onFailure.accept(exn);
×
315
                        return true;
×
316
                }
317

318
                /**
319
                 * Add a report to the database of a problem with a board.
320
                 *
321
                 * @param sql
322
                 *            How to talk to the DB
323
                 * @param boardId
324
                 *            Which board has the problem
325
                 * @param jobId
326
                 *            What job was associated with the problem (if any)
327
                 * @param msg
328
                 *            Information about what the problem was
329
                 */
330
                final void addBoardReport(Connection c, int boardId, Integer jobId,
331
                                String msg) {
332
                        try (var getUser = c.query(GET_USER_DETAILS_BY_NAME);
×
333
                                        var insertBoardReport = c.update(INSERT_BOARD_REPORT)) {
×
334
                                getUser.call1(row -> row.getInt("user_id"),
×
335
                                                allocProps.getSystemReportUser()).ifPresent(
×
336
                                                                userId -> insertBoardReport.call(
×
337
                                                                                boardId, jobId,        msg, userId));
×
338
                        }
339
                }
×
340

341
                /**
342
                 * Marks a board as actually dead, and requests we send email about it.
343
                 *
344
                 * @param sql
345
                 *            How to talk to the DB
346
                 * @param boardId
347
                 *            Which board has the problem
348
                 * @param msg
349
                 *            Information about what the problem was
350
                 * @return Whether we've successfully done a change.
351
                 */
352
                final void markBoardAsDead(Connection c, int boardId, String msg) {
353
                        try (var setFunctioning = c.update(SET_FUNCTIONING_FIELD);
×
354
                                        var findBoardById = c.query(FIND_BOARD_BY_ID)) {
×
355
                                boolean result = setFunctioning.call(false, boardId) > 0;
×
356
                                if (result) {
×
357
                                        findBoardById.call1(row -> {
×
358
                                                var ser = row.getString("physical_serial_id");
×
359
                                                if (ser == null) {
×
360
                                                        ser = "<UNKNOWN>";
×
361
                                                }
362
                                                var fullMessage = format(
×
363
                                                                "Marked board at %d,%d,%d of %s (serial: %s) "
364
                                                                                + "as dead: %s",
365
                                                                row.getInt("x"), row.getInt("y"),
×
366
                                                                row.getInt("z"), row.getString("machine_name"),
×
367
                                                                ser, msg);
368
                                                emailSender.sendServiceMail(fullMessage);
×
369
                                                return null;
×
370
                                        }, boardId);
×
371
                                }
372
                        }
373
                }
×
374

375
                boolean processRequest(SpiNNakerControl control) {
376
                        while (isRepeat()) {
3✔
377
                                try {
378
                                        if (tryProcessRequest(control)) {
3✔
379
                                                return true;
3✔
380
                                        }
381
                                        sleep(props.getProbeInterval().toMillis());
×
382
                                } catch (InterruptedException e) {
×
383
                                        // If this happens, just cancel the transaction;
384
                                        // when we come back, all things will be redone.
385
                                        throw new RuntimeException(e);
×
386
                                }
×
387
                        }
388
                        return false;
×
389
                }
390

391
                abstract boolean tryProcessRequest(SpiNNakerControl control)
392
                                throws InterruptedException;
393
        }
394

395
        /**
396
         * Describes a request to modify the power status of a collection of boards.
397
         * The boards must be on a single machine and must all be assigned to a
398
         * single job.
399
         * <p>
400
         * This is the message that is sent from the main thread to the per-BMP
401
         * worker threads.
402
         *
403
         * @author Donal Fellows
404
         */
405
        private final class PowerRequest extends Request {
406
                private final List<BMPBoard> powerOnBoards = new ArrayList<>();
3✔
407

408
                private final List<BMPBoard> powerOffBoards = new ArrayList<>();
3✔
409

410
                private final List<Link> linkRequests = new ArrayList<>();
3✔
411

412
                private final int jobId;
413

414
                private final JobState from;
415

416
                private final JobState to;
417

418
                private final List<Integer> changeIds = new ArrayList<>();
3✔
419

420
                private final Map<Integer, Integer> boardToId = new HashMap<>();
3✔
421

422
                /**
423
                 * Create a request.
424
                 *
425
                 * @param sql
426
                 *            How to access the database.
427
                 * @param machine
428
                 *            What machine are the boards on? <em>Must not</em> be
429
                 *            {@code null}.
430
                 * @param powerOn
431
                 *            What boards (by DB ID) are to be powered on? May be
432
                 *            {@code null}; that's equivalent to the empty list.
433
                 * @param powerOff
434
                 *            What boards (by DB ID) are to be powered off? May be
435
                 *            {@code null}; that's equivalent to the empty list.
436
                 * @param links
437
                 *            Any link power control requests. By default, links are on
438
                 *            if their board is on and they are connected; it is
439
                 *            <em>useful and relevant</em> to modify the power state of
440
                 *            links on the periphery of an allocation. May be
441
                 *            {@code null}; that's equivalent to the empty list.
442
                 * @param jobId
443
                 *            For what job is this?
444
                 * @param from
445
                 *            What state is the job moving from?
446
                 * @param to
447
                 *            What state is the job moving to?
448
                 * @param changeIds
449
                 *            The DB ids that describe the change, so we can update
450
                 *            those records.
451
                 * @param idToBoard
452
                 *            How to get the physical ID of a board from its database ID
453
                 */
454
                PowerRequest(int bmpId, int jobId, JobState from, JobState to,
455
                                List<PowerChange> powerChanges) {
3✔
456
                        super(bmpId);
3✔
457
                        for (var change : powerChanges) {
3✔
458
                                if (change.power) {
3✔
459
                                        powerOnBoards.add(new BMPBoard(change.boardNum));
3✔
460
                                } else {
461
                                        powerOffBoards.add(new BMPBoard(change.boardNum));
3✔
462
                                }
463
                                change.offLinks.stream().forEach(link ->
3✔
464
                                                linkRequests.add(new Link(change.boardNum, link)));
3✔
465
                                changeIds.add(change.changeId);
3✔
466
                                boardToId.put(change.boardNum, change.boardId);
3✔
467
                        }
3✔
468
                        this.jobId = jobId;
3✔
469
                        this.from = from;
3✔
470
                        this.to = to;
3✔
471
                }
3✔
472

473
                /**
474
                 * Change the power state of boards in this request.
475
                 *
476
                 * @param controllers
477
                 *            How to actually communicate with the machine
478
                 * @throws ProcessException
479
                 *             If the transceiver chokes
480
                 * @throws InterruptedException
481
                 *             If interrupted
482
                 * @throws IOException
483
                 *             If network I/O fails
484
                 */
485
                void changeBoardPowerState(SpiNNakerControl controller)
486
                                throws ProcessException, InterruptedException, IOException {
487

488
                        // Send any power on commands
489
                        if (!powerOnBoards.isEmpty()) {
3✔
490
                                controller.powerOnAndCheck(powerOnBoards);
3✔
491
                        }
492

493
                        // Process perimeter link requests next
494
                        for (var linkReq : linkRequests) {
3✔
495
                                // Set the link state, as required
496
                                controller.setLinkOff(linkReq);
3✔
497
                        }
3✔
498

499
                        // Finally send any power off commands
500
                        if (!powerOffBoards.isEmpty()) {
3✔
501
                                controller.powerOff(powerOffBoards);
3✔
502
                        }
503
                }
3✔
504

505
                /**
506
                 * Handles the database changes after a set of changes to a BMP complete
507
                 * successfully. We will move the job to the state it supposed to be in.
508
                 *
509
                 * @param sql
510
                 *            How to access the DB
511
                 * @return Whether the state of boards or jobs has changed.
512
                 */
513
                private void done() {
514
                        try (var c = getConnection();
3✔
515
                                        var deallocateBoards = c.update(DEALLOCATE_BMP_BOARDS_JOB);
3✔
516
                                        var deleteChange = c.update(FINISHED_PENDING);
3✔
517
                                        var setBoardPowerOn = c.update(SET_BOARD_POWER_ON);
3✔
518
                                        var setBoardPowerOff = c.update(SET_BOARD_POWER_OFF)) {
3✔
519
                                c.transaction(() -> {
3✔
520
                                        int turnedOn = powerOnBoards.stream().map(this::getBoardId)
3✔
521
                                                        .mapToInt(setBoardPowerOn::call).sum();
3✔
522
                                        int turnedOff =
3✔
523
                                                        powerOffBoards.stream().map(this::getBoardId)
3✔
524
                                                                        .mapToInt(setBoardPowerOff::call).sum();
3✔
525

526
                                        if (to == DESTROYED || to == QUEUED) {
3✔
527
                                                /*
528
                                                 * Need to mark the boards as not allocated; can't do
529
                                                 * that until they've been switched off.
530
                                                 */
531
                                                deallocateBoards.call(jobId, bmpId);
3✔
532
                                        }
533
                                        int completed = changeIds.stream().mapToInt(
3✔
534
                                                        deleteChange::call).sum();
3✔
535

536
                                        log.debug("BMP ACTION SUCCEEDED ({}:{}->{}): on:{} off:{} "
3✔
537
                                                        + "completed: {}",
538
                                                        jobId, from, to, turnedOn, turnedOff, completed);
3✔
539
                                });
3✔
540
                        }
541

542
                        // Tell the allocator something has happened
543
                        allocator.updateJob(jobId, from, to);
3✔
544
                }
3✔
545

546
                /**
547
                 * Handles the database changes after a set of changes to a BMP complete
548
                 * with a failure. We will roll back the job state to what it was
549
                 * before.
550
                 *
551
                 * @param sql
552
                 *            How to access the DB
553
                 * @return Whether the state of boards or jobs has changed.
554
                 */
555
                private void failed() {
556
                        var resetJobAlloc = false;
×
557
                        try (var c = getConnection();
×
558
                                        var deallocateBoards = c.update(DEALLOCATE_BMP_BOARDS_JOB);
×
559
                                        var deleteChange = c.update(FINISHED_PENDING);
×
560
                                        var setBoardPowerOff = c.update(SET_BOARD_POWER_OFF)) {
×
561
                                resetJobAlloc = c.transaction(() -> {
×
562
                                        // We should mark the boards as off
563
                                        int turnedOff =
×
564
                                                        powerOffBoards.stream().map(this::getBoardId)
×
565
                                                                        .mapToInt(setBoardPowerOff::call).sum();
×
566

567
                                        // ... even those that we should be powering on ...
568
                                        turnedOff +=
×
569
                                                        powerOnBoards.stream().map(this::getBoardId)
×
570
                                                                        .mapToInt(setBoardPowerOff::call).sum();
×
571

572
                                        // Deallocate the boards on this bmp from the job;
573
                                        // other boards can be deallocated elsewhere.
574
                                        deallocateBoards.call(jobId, bmpId);
×
575

576
                                        // Delete change ids as they are done even if failed.
577
                                        var completed = changeIds.stream().mapToInt(
×
578
                                                        deleteChange::call).sum();
×
579

580
                                        log.debug(
×
581
                                                        "BMP ACTION FAILED on {} ({}:{}->{}) off:{} "
582
                                                        + "completed:{}",
583
                                                        bmpId, jobId, from, to, turnedOff, completed);
×
584

585
                                        // If we were meant to be powering up, reset the allocation
586
                                        // once done here.
587
                                        return (to == READY && powerOffBoards.isEmpty());
×
588
                                });
589
                        }
590
                        if (resetJobAlloc) {
×
591
                                allocator.resetPowerOnFailure(jobId);
×
592
                        }
593
                }
×
594

595
                /**
596
                 * Process an action to power on or off a set of boards. Runs on a
597
                 * thread that may touch a BMP directly, but which may not touch the
598
                 * database.
599
                 *
600
                 * @param controller
601
                 *            How to actually reach the BMPs.
602
                 * @return Whether this action has "succeeded" and shouldn't be retried.
603
                 * @throws InterruptedException
604
                 *             If interrupted.
605
                 */
606
                @Override
607
                boolean tryProcessRequest(SpiNNakerControl controller)
608
                                throws InterruptedException {
609
                        boolean ok = bmpAction(() -> {
3✔
610
                                changeBoardPowerState(controller);
3✔
611
                                // We want to ensure the lead board is alive
612
                                if (!serviceControl.isUseDummyBMP()) {
3✔
613
                                        // Don't bother with pings when the dummy is enabled
614
                                        controller.ping(powerOnBoards);
×
615
                                }
616
                                done();
3✔
617
                        }, e -> {
3✔
618
                                failed();
×
619
                                synchronized (BMPController.this) {
×
620
                                        bmpProcessingException = e;
×
621
                                }
×
622
                        }, ppe -> {
×
623
                                badBoard(ppe);
×
624
                        });
×
625
                        return ok;
3✔
626
                }
627

628
                @Override
629
                public String toString() {
630
                        var sb = new StringBuilder("PowerRequest(for=")
×
631
                                        .append(bmpId);
×
632
                        sb.append(";on=").append(powerOnBoards);
×
633
                        sb.append(",off=").append(powerOffBoards);
×
634
                        sb.append(",links=").append(linkRequests);
×
635
                        return sb.append(")").toString();
×
636
                }
637

638
                private static final String REPORT_MSG =
639
                                "board was not reachable when trying to power it: ";
640

641
                /**
642
                 * When a BMP is unroutable, we must tell the alloc engine to pick
643
                 * somewhere else, and we should mark the board as out of service too;
644
                 * it's never going to work so taking it out right away is the only sane
645
                 * plan. We also need to nuke the planned changes. Retrying is bad.
646
                 *
647
                 * @param failure
648
                 *            The failure message.
649
                 * @return Whether the state of boards or jobs has changed.
650
                 */
651
                private void badBoard(ProcessException failure) {
652
                        try (var c = getConnection()) {
×
653
                                c.transaction(() -> {
×
654
                                        getBoardId(failure.source).ifPresent(boardId -> {
×
655
                                                // Mark the board as dead right now
656
                                                markBoardAsDead(c, boardId, REPORT_MSG + failure);
×
657
                                                // Add a report if we can
658
                                                addBoardReport(c, boardId, jobId, REPORT_MSG + failure);
×
659
                                        });
×
660
                                });
×
661
                        }
662
                }
×
663

664
                /**
665
                 * Given a board address, get the ID that it corresponds to. Reverses
666
                 * {@link #idToBoard}.
667
                 *
668
                 * @param addr
669
                 *            The board address.
670
                 * @return The ID, if one can be found.
671
                 */
672
                private Optional<Integer> getBoardId(HasBMPLocation addr) {
673
                        return Optional.ofNullable(boardToId.get(addr.getBoard()));
×
674
                }
675

676
                private Integer getBoardId(BMPBoard board) {
677
                        return boardToId.get(board.board);
3✔
678
                }
679
        }
680

681
        /**
682
         * A request to read or write information on a BMP. Includes blacklists,
683
         * serial numbers, temperature data, etc.
684
         *
685
         * @author Donal Fellows
686
         */
687
        private final class BoardRequest extends Request {
688
                private final NonBootOperation op;
689

690
                private final int opId;
691

692
                private final int boardId;
693

694
                private final BMPCoords bmp;
695

696
                private final BMPBoard board;
697

698
                private final String bmpSerialId;
699

700
                private final Blacklist blacklist;
701

702
                private final int machineId;
703

704
                private BoardRequest(int bmpId, NonBootOperation op, Row row) {
3✔
705
                        super(bmpId);
3✔
706
                        this.op = op;
3✔
707
                        opId = row.getInt("op_id");
3✔
708
                        boardId = row.getInt("board_id");
3✔
709
                        bmp = new BMPCoords(row.getInt("cabinet"), row.getInt("frame"));
3✔
710
                        board = new BMPBoard(row.getInt("board_num"));
3✔
711
                        if (op == WRITE_BL) {
3✔
712
                                blacklist = row.getSerial("data", Blacklist.class);
3✔
713
                        } else {
714
                                blacklist = null;
3✔
715
                        }
716
                        bmpSerialId = row.getString("bmp_serial_id");
3✔
717
                        machineId = row.getInt("machine_id");
3✔
718
                }
3✔
719

720
                /** The serial number actually read from the board. */
721
                private String readSerial;
722

723
                /**
724
                 * Access the DB to store the serial number information that we
725
                 * retrieved. A transaction should already be held.
726
                 *
727
                 * @param c
728
                 *            How to access the DB
729
                 * @return Whether we've changed anything
730
                 */
731
                private void recordSerialIds(Connection c) {
732
                        try (var setBoardSerialIds = c.update(SET_BOARD_SERIAL_IDS)) {
3✔
733
                                setBoardSerialIds.call(boardId, readSerial,
3✔
734
                                                phySerMap.getPhysicalId(readSerial));
3✔
735
                        }
736
                }
3✔
737

738
                /**
739
                 * Access the DB to mark the read request as successful and store the
740
                 * blacklist that was read. A transaction should already be held.
741
                 *
742
                 * @param c
743
                 *            How to access the DB
744
                 * @param readBlacklist
745
                 *            The blacklist that was read
746
                 * @return Whether we've changed anything
747
                 */
748
                private void doneReadBlacklist(Connection c, Blacklist readBlacklist) {
749
                        try (var completed = c.update(COMPLETED_BOARD_INFO_READ)) {
3✔
750
                                log.debug("Completing blacklist read opId {}", opId);
3✔
751
                                completed.call(readBlacklist, opId);
3✔
752
                        }
753
                }
3✔
754

755
                /**
756
                 * Access the DB to mark the write request as successful. A transaction
757
                 * should already be held.
758
                 *
759
                 * @param c
760
                 *            How to access the DB
761
                 * @return Whether we've changed anything
762
                 */
763
                private void doneWriteBlacklist(Connection c) {
764
                        try (var completed = c.update(COMPLETED_BLACKLIST_WRITE)) {
3✔
765
                                completed.call(opId);
3✔
766
                        }
767
                }
3✔
768

769
                /**
770
                 * Access the DB to mark the read request as successful; the actual
771
                 * store of the serial data is elsewhere
772
                 * ({@link #recordSerialIds(Connection)}). A transaction should already
773
                 * be held.
774
                 *
775
                 * @param c
776
                 *            How to access the DB
777
                 * @return Whether we've changed anything
778
                 */
779
                private void doneReadSerial(Connection c) {
780
                        try (var completed = c.update(COMPLETED_GET_SERIAL_REQ)) {
3✔
781
                                completed.call(opId);
3✔
782
                        }
783
                }
3✔
784

785
                /**
786
                 * Access the DB to mark the read request as successful and store the
787
                 * ADC info that was read. A transaction should be held.
788
                 *
789
                 * @param c
790
                 *            The database connection.
791
                 */
792
                private void doneReadTemps(Connection c, ADCInfo adcInfo) {
793
                        try (var completed = c.update(COMPLETED_BOARD_INFO_READ)) {
×
794
                                log.debug("Completing temperature read opId {}", opId);
×
795
                                completed.call(adcInfo, opId);
×
796
                        }
797
                }
×
798

799
                /**
800
                 * Access the DB to mark the request as failed and store the exception.
801
                 *
802
                 * @param exn
803
                 *            The exception that caused the failure.
804
                 * @return Whether we've changed anything
805
                 */
806
                private void failed(Exception exn) {
807
                        try (var c = getConnection();
×
808
                                        var failed = c.update(FAILED_BLACKLIST_OP)) {
×
809
                                c.transaction(() -> failed.call(exn, opId));
×
810
                        }
811
                }
×
812

813
                private static final String REPORT_MSG =
814
                                "board was not reachable when trying to access its blacklist: ";
815

816
                /**
817
                 * Access the DB to mark a board as out of service.
818
                 *
819
                 * @param exn
820
                 *            The exception that caused the failure.
821
                 * @return Whether we've changed anything
822
                 */
823
                void takeOutOfService(Exception exn) {
824
                        try (var c = getConnection()) {
×
825
                                c.transaction(() -> {
×
826
                                        addBoardReport(c, boardId, null, REPORT_MSG + exn);
×
827
                                        markBoardAsDead(c, boardId, REPORT_MSG + exn);
×
828
                                });
×
829
                        }
830
                }
×
831

832
                /**
833
                 * Process an action to work with a blacklist or serial number. Runs on
834
                 * a thread that may touch a BMP directly, but which may not touch the
835
                 * database.
836
                 *
837
                 * @param controller
838
                 *            How to actually reach the BMP.
839
                 * @return Whether this action has "succeeded" and shouldn't be retried.
840
                 * @throws InterruptedException
841
                 *             If interrupted.
842
                 */
843
                @Override
844
                boolean tryProcessRequest(SpiNNakerControl controller)
845
                                throws InterruptedException {
846
                        return bmpAction(() -> {
3✔
847
                                switch (op) {
3✔
848
                                case WRITE_BL:
849
                                        writeBlacklist(controller);
3✔
850
                                        break;
3✔
851
                                case READ_BL:
852
                                        readBlacklist(controller);
3✔
853
                                        break;
3✔
854
                                case GET_SERIAL:
855
                                        readSerial(controller);
3✔
856
                                        break;
3✔
857
                                case READ_TEMP:
858
                                        readTemps(controller);
×
859
                                        break;
×
860
                                default:
861
                                        throw new IllegalArgumentException();
×
862
                                }
863
                                epochs.blacklistChanged(boardId);
3✔
864
                                epochs.machineChanged(machineId);
3✔
865
                        }, e -> {
3✔
866
                                failed(e);
×
867
                        }, ppe -> {
×
868
                                takeOutOfService(ppe);
×
869
                        });
×
870
                }
871

872
                /**
873
                 * Process an action to read a blacklist.
874
                 *
875
                 * @param controller
876
                 *            How to actually reach the BMP.
877
                 * @throws InterruptedException
878
                 *             If interrupted.
879
                 * @throws IOException
880
                 *             If the network is unhappy.
881
                 * @throws ProcessException
882
                 *             If the BMP rejects a message.
883
                 */
884
                private void readBlacklist(SpiNNakerControl controller)
885
                                throws InterruptedException, ProcessException, IOException {
886
                        readSerial = controller.readSerial(board);
3✔
887
                        if (bmpSerialId != null && !bmpSerialId.equals(readSerial)) {
3✔
888
                                /*
889
                                 * Doesn't match; WARN but keep going; hardware may just be
890
                                 * remapped behind our back.
891
                                 */
892
                                log.warn(
×
893
                                                "blacklist read mismatch: expected serial ID '{}' "
894
                                                                + "not equal to actual serial ID '{}'",
895
                                                bmpSerialId, readSerial);
896
                        }
897
                        var readBlacklist = controller.readBlacklist(board);
3✔
898
                        try (var c = getConnection()) {
3✔
899
                                c.transaction(() -> {
3✔
900
                                        recordSerialIds(c);
3✔
901
                                        doneReadBlacklist(c, readBlacklist);
3✔
902
                                });
3✔
903
                        }
904
                }
3✔
905

906
                /**
907
                 * Process an action to write a blacklist.
908
                 *
909
                 * @param controller
910
                 *            How to actually reach the BMP.
911
                 * @throws InterruptedException
912
                 *             If interrupted.
913
                 * @throws IOException
914
                 *             If the network is unhappy.
915
                 * @throws ProcessException
916
                 *             If the BMP rejects a message.
917
                 * @throws IllegalStateException
918
                 *             If the operation is applied to a board other than the one
919
                 *             that it is expected to apply to.
920
                 */
921
                private void writeBlacklist(SpiNNakerControl controller)
922
                                throws InterruptedException, ProcessException, IOException {
923
                        readSerial = controller.readSerial(board);
3✔
924
                        if (bmpSerialId != null && !bmpSerialId.equals(readSerial)) {
3✔
925
                                // Doesn't match, so REALLY unsafe to keep going!
926
                                throw new IllegalStateException(format(
×
927
                                                "aborting blacklist write: expected serial ID '%s' "
928
                                                                + "not equal to actual serial ID '%s'",
929
                                                bmpSerialId, readSerial));
930
                        }
931
                        controller.writeBlacklist(board, requireNonNull(blacklist));
3✔
932
                        try (var c = getConnection()) {
3✔
933
                                c.transaction(() -> doneWriteBlacklist(c));
3✔
934
                        }
935
                }
3✔
936

937
                /**
938
                 * Process an action to read the serial number from a BMP.
939
                 *
940
                 * @param controller
941
                 *            How to actually reach the BMP.
942
                 * @throws InterruptedException
943
                 *             If interrupted.
944
                 * @throws IOException
945
                 *             If the network is unhappy
946
                 * @throws ProcessException
947
                 *             If the BMP rejects a message.
948
                 */
949
                private void readSerial(SpiNNakerControl controller)
950
                                throws InterruptedException, ProcessException, IOException {
951
                        readSerial = controller.readSerial(board);
3✔
952
                        try (var c = getConnection()) {
3✔
953
                                c.transaction(() -> {
3✔
954
                                        recordSerialIds(c);
3✔
955
                                        doneReadSerial(c);
3✔
956
                                });
3✔
957
                        }
958
                }
3✔
959

960
                /**
961
                 * Process an action to read some temperature data.
962
                 *
963
                 * @param controller
964
                 *            How to actually reach the BMP.
965
                 * @throws InterruptedException
966
                 *             If interrupted.
967
                 * @throws IOException
968
                 *             If the network is unhappy.
969
                 * @throws ProcessException
970
                 *             If the BMP rejects a message.
971
                 */
972
                private void readTemps(SpiNNakerControl controller)
973
                                throws InterruptedException, ProcessException, IOException {
974
                        var adcInfo = controller.readTemp(board);
×
975
                        try (var c = getConnection()) {
×
976
                                c.transaction(() -> doneReadTemps(c, adcInfo));
×
977
                        }
978
                }
×
979

980
                @Override
981
                public String toString() {
982
                        var sb = new StringBuilder("BoardRequest(for ");
×
983
                        sb.append("bmp=").append(bmp);
×
984
                        sb.append(",board=").append(boardId);
×
985
                        sb.append(",op=").append(op);
×
986
                        return sb.append(")").toString();
×
987
                }
988
        }
989

990
        private class PowerChange {
991
                final Integer changeId;
992

993
                final int jobId;
994

995
                final Integer boardId;
996

997
                final Integer boardNum;
998

999
                final boolean power;
1000

1001
                final JobState from;
1002

1003
                final JobState to;
1004

1005
                final List<Direction> offLinks;
1006

1007
                PowerChange(Row row) {
3✔
1008
                        changeId = row.getInteger("change_id");
3✔
1009
                        jobId = row.getInt("job_id");
3✔
1010
                        boardId = row.getInteger("board_id");
3✔
1011
                        boardNum = row.getInteger("board_num");
3✔
1012
                        power = row.getBoolean("power");
3✔
1013
                        from = row.getEnum("from_state", JobState.class);
3✔
1014
                        to = row.getEnum("to_state", JobState.class);
3✔
1015
                        offLinks = List.of(Direction.values()).stream().filter(
3✔
1016
                                        link -> !row.getBoolean(link.columnName)).collect(
3✔
1017
                                                        Collectors.toList());
3✔
1018
                }
3✔
1019

1020
                boolean isSameJob(PowerChange p) {
1021
                        return p.jobId == jobId && p.from == from && p.to == to;
×
1022
                }
1023
        }
1024

1025
        // ----------------------------------------------------------------
1026
        // WORKER IMPLEMENTATION
1027

1028
        /** A worker of a given BMP. */
1029
        private final class Worker implements Runnable {
1030
                /** What are we controlling? */
1031
                private final SpiNNakerControl control;
1032

1033
                /** Which boards are we looking at? */
1034
                private final int bmpId;
1035

1036
                Worker(SpiNNakerControl control, int bmpId) {
3✔
1037
                        this.control = control;
3✔
1038
                        this.bmpId = bmpId;
3✔
1039

1040
                        log.debug("Created worker for boards {}", bmpId);
3✔
1041
                }
3✔
1042

1043
                /**
1044
                 * Periodically call to update, or trigger externally.
1045
                 */
1046
                @Override
1047
                public synchronized void run() {
1048
                        log.trace("Searching for changes on BMP {}", bmpId);
3✔
1049

1050
                        try {
1051
                                var changes = getRequestedOperations();
3✔
1052
                                for (var change : changes) {
3✔
1053
                                        change.processRequest(control);
3✔
1054
                                }
3✔
1055
                        } catch (Exception e) {
×
1056
                                log.error("unhandled exception for BMP '{}'", bmpId, e);
×
1057
                        }
3✔
1058
                }
3✔
1059

1060
                /**
1061
                 * Get the things that we want the worker to do. <em>Be very
1062
                 * careful!</em> Because this necessarily involves the database, this
1063
                 * must not touch the BMP handle as those operations take a long time
1064
                 * and we absolutely must not have a transaction open at the same time.
1065
                 *
1066
                 * @return List of operations to perform.
1067
                 */
1068
                private List<Request> getRequestedOperations() {
1069
                        var requests = new ArrayList<Request>();
3✔
1070
                        try (var c = getConnection();
3✔
1071
                                        var getPowerRequests = c.query(GET_CHANGES);
3✔
1072
                                        var getBlacklistReads = c.query(GET_BLACKLIST_READS);
3✔
1073
                                        var getBlacklistWrites = c.query(GET_BLACKLIST_WRITES);
3✔
1074
                                        var getReadSerialInfos = c.query(GET_SERIAL_INFO_REQS);
3✔
1075
                                        var getReadTemps = c.query(GET_TEMP_INFO_REQS)) {
3✔
1076
                                c.transaction(false, () -> {
3✔
1077
                                        // Batch power requests by job
1078
                                        var powerChanges = new LinkedList<>(
3✔
1079
                                                        getPowerRequests.call(PowerChange::new, bmpId));
3✔
1080
                                        while (!powerChanges.isEmpty()) {
3✔
1081
                                                var change = powerChanges.poll();
3✔
1082
                                                var jobChanges = new ArrayList<>(List.of(change));
3✔
1083
                                                while (!powerChanges.isEmpty()
3✔
1084
                                                                && change.isSameJob(powerChanges.peek())) {
×
1085
                                                        jobChanges.add(powerChanges.poll());
×
1086
                                                }
1087
                                                if (!jobChanges.isEmpty()) {
3✔
1088
                                                        log.debug("Running job changes {}", jobChanges);
3✔
1089
                                                        requests.add(new PowerRequest(bmpId, change.jobId,
3✔
1090
                                                                        change.from, change.to, jobChanges));
1091
                                                }
1092
                                        }
3✔
1093

1094
                                        // Leave these until quiet
1095
                                        if (requests.isEmpty()) {
3✔
1096
                                                requests.addAll(getBlacklistReads.call(
3✔
1097
                                                                row -> new BoardRequest(bmpId, READ_BL, row),
3✔
1098
                                                                bmpId));
3✔
1099
                                        }
1100
                                        if (requests.isEmpty()) {
3✔
1101
                                                requests.addAll(getBlacklistWrites.call(
3✔
1102
                                                                row -> new BoardRequest(bmpId, WRITE_BL, row),
3✔
1103
                                                                bmpId));
3✔
1104
                                                requests.addAll(getReadSerialInfos.call(
3✔
1105
                                                                row -> new BoardRequest(bmpId, GET_SERIAL, row),
3✔
1106
                                                                bmpId));
3✔
1107
                                                requests.addAll(getReadTemps.call(
3✔
1108
                                                                row -> new BoardRequest(bmpId, READ_TEMP, row),
×
1109
                                                                bmpId));
3✔
1110
                                        }
1111
                                });
3✔
1112
                        } catch (Exception e) {
×
1113
                                log.error("unhandled exception for BMP '{}'", bmpId, e);
×
1114
                        }
3✔
1115
                        return requests;
3✔
1116
                }
1117
        }
1118

1119
        /**
1120
         * The testing interface.
1121
         *
1122
         * @hidden
1123
         */
1124
        @ForTestingOnly
1125
        public interface TestAPI {
1126
                /**
1127
                 * Ensure things are set up after a database change that updates the
1128
                 * BMPs in the system.
1129
                 */
1130
                void prepare();
1131

1132
                /**
1133
                 * The core of the scheduler.
1134
                 *
1135
                 * @param millis
1136
                 *            How many milliseconds to sleep before doing a rerun of the
1137
                 *            scheduler. If zero (or less), only one run will be done.
1138
                 * @param bmps
1139
                 *            The BMPs to be updated.
1140
                 * @throws IOException
1141
                 *             If talking to the network fails
1142
                 * @throws SpinnmanException
1143
                 *             If a BMP sends an error back
1144
                 * @throws InterruptedException
1145
                 *             If the wait for workers to spawn fails.
1146
                 */
1147
                void processRequests(long millis, Collection<Integer> bmps)
1148
                                throws IOException, SpinnmanException, InterruptedException;
1149

1150
                /**
1151
                 * The core of the scheduler. Will process for all known BMPs.
1152
                 *
1153
                 * @param millis
1154
                 *            How many milliseconds to sleep before doing a rerun of the
1155
                 *            scheduler. If zero (or less), only one run will be done.
1156
                 * @throws IOException
1157
                 *             If talking to the network fails
1158
                 * @throws SpinnmanException
1159
                 *             If a BMP sends an error back
1160
                 * @throws InterruptedException
1161
                 *             If the wait for workers to spawn fails.
1162
                 */
1163
                void processRequests(long millis)
1164
                                throws IOException, SpinnmanException, InterruptedException;
1165

1166
                /**
1167
                 * Get the last BMP exception.
1168
                 *
1169
                 * @return The exception.
1170
                 */
1171
                Throwable getBmpException();
1172

1173
                /**
1174
                 * Clear the last BMP exception.
1175
                 */
1176
                void clearBmpException();
1177
        }
1178

1179
        /**
1180
         * @return The test interface.
1181
         * @deprecated This interface is just for testing.
1182
         * @hidden
1183
         */
1184
        @ForTestingOnly
1185
        @RestrictedApi(explanation = "just for testing", link = "index.html",
1186
                        allowedOnPath = ".*/src/test/java/.*")
1187
        @Deprecated
1188
        public final TestAPI getTestAPI() {
1189
                ForTestingOnly.Utils.checkForTestClassOnStack();
3✔
1190
                return new TestAPI() {
3✔
1191
                        @Override
1192
                        public void prepare() {
1193
                                makeWorkers();
3✔
1194
                        }
3✔
1195

1196
                        @Override
1197
                        public void processRequests(long millis, Collection<Integer> bmps)
1198
                                        throws IOException, SpinnmanException,
1199
                                        InterruptedException {
1200
                                /*
1201
                                 * Runs twice because it takes two cycles to fully process a
1202
                                 * request.
1203
                                 */
1204
                                triggerSearch(bmps);
3✔
1205
                                if (millis > 0) {
3✔
1206
                                        Thread.sleep(millis);
3✔
1207
                                        triggerSearch(bmps);
3✔
1208
                                }
1209
                        }
3✔
1210

1211
                        @Override
1212
                        public void processRequests(long millis) throws IOException,
1213
                                        SpinnmanException, InterruptedException {
1214
                                processRequests(millis, workers.keySet());
3✔
1215
                        }
3✔
1216

1217
                        @Override
1218
                        public Throwable getBmpException() {
1219
                                synchronized (BMPController.this) {
3✔
1220
                                        return bmpProcessingException;
3✔
1221
                                }
1222
                        }
1223

1224
                        @Override
1225
                        public void clearBmpException() {
1226
                                synchronized (BMPController.this) {
3✔
1227
                                        bmpProcessingException = null;
3✔
1228
                                }
3✔
1229
                        }
3✔
1230
                };
1231
        }
1232
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc