• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

sxs-collaboration / spectre / 4245613002

pending completion
4245613002

push

github

GitHub
Merge pull request #4758 from nilsvu/exit_codes

4 of 4 new or added lines in 2 files covered. (100.0%)

63933 of 66631 relevant lines covered (95.95%)

431414.95 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.24
/src/Parallel/PhaseControl/CheckpointAndExitAfterWallclock.hpp
1
// Distributed under the MIT License.
2
// See LICENSE.txt for details.
3

4
#pragma once
5

6
#include <optional>
7
#include <pup.h>
8
#include <string>
9
#include <type_traits>
10
#include <utility>
11

12
#include "Options/Auto.hpp"
13
#include "Options/Options.hpp"
14
#include "Parallel/AlgorithmMetafunctions.hpp"
15
#include "Parallel/CharmPupable.hpp"
16
#include "Parallel/ExitCode.hpp"
17
#include "Parallel/GlobalCache.hpp"
18
#include "Parallel/Phase.hpp"
19
#include "Parallel/PhaseControl/ContributeToPhaseChangeReduction.hpp"
20
#include "Parallel/PhaseControl/PhaseChange.hpp"
21
#include "Utilities/ErrorHandling/Assert.hpp"
22
#include "Utilities/Functional.hpp"
23
#include "Utilities/System/ParallelInfo.hpp"
24
#include "Utilities/TMPL.hpp"
25
#include "Utilities/TaggedTuple.hpp"
26

27
namespace PhaseControl {
28

29
namespace Tags {
30
/// Storage in the phase change decision tuple so that the Main chare can record
31
/// the phase to go to when restarting the run from a checkpoint file.
32
///
33
/// \note This tag is not intended to participate in any of the reduction
34
/// procedures, so will error if the combine method is called.
35
struct RestartPhase {
36
  using type = std::optional<Parallel::Phase>;
37

38
  struct combine_method {
39
    [[noreturn]] std::optional<Parallel::Phase> operator()(
40
        const std::optional<Parallel::Phase> /*first_phase*/,
41
        const std::optional<Parallel::Phase>& /*second_phase*/);
42
  };
43

44
  using main_combine_method = combine_method;
45
};
46

47
/// Slot in the phase change decision tuple in which the Main chare records
/// the wallclock time that has elapsed since the start of the run.
///
/// \note This tag is not meant to take part in any reduction procedure, so
/// calling its combine method is an error.
struct WallclockHoursAtCheckpoint {
  using type = std::optional<double>;

  struct combine_method {
    // `type` is the enclosing tag's `std::optional<double>`
    [[noreturn]] type operator()(const type /*first_time*/,
                                 const type& /*second_time*/);
  };

  using main_combine_method = combine_method;
};
62

63
/// Stores whether the checkpoint and exit has been requested.
64
///
65
/// Combinations are performed via `funcl::Or`, as the phase in question should
66
/// be chosen if any component requests the jump.
67
struct CheckpointAndExitRequested {
68
  using type = bool;
69

70
  using combine_method = funcl::Or<>;
71
  using main_combine_method = funcl::Or<>;
72
};
73

74
}  // namespace Tags
75

76
/*!
77
 * \brief Phase control object that runs the WriteCheckpoint and Exit phases
78
 * after a specified amount of wallclock time has elapsed.
79
 *
80
 * This phase control is useful for running SpECTRE executables performing
81
 * lengthy computations that may exceed a supercomputer's wallclock limits.
82
 * Writing a single checkpoint at the end of the job's allocated time allows
83
 * the computation to be continued, while minimizing the disc space taken up by
84
 * checkpoint files.
85
 *
86
 * Note that this phase control is not a trigger on wallclock time. Rather,
87
 * it checks the elapsed wallclock time when called, likely from a global sync
88
 * point triggered by some other mechanism, e.g., at some slab boundary.
89
 * Therefore, the WriteCheckpoint and Exit phases will run the first time
90
 * this phase control is called after the specified wallclock time has been
91
 * reached.
92
 *
93
 * \warning the global sync points _must_ be triggered often enough to ensure
94
 * there will be at least one sync point (i.e., one call to this phase control)
95
 * in the window between the requested checkpoint-and-exit time and the time at
96
 * which the batch system will kill the executable. To make this more concrete,
97
 * consider this example: when running on a 12-hour queue with a
98
 * checkpoint-and-exit requested after 11.5 hours, there is a 0.5-hour window
99
 * for a global sync to occur, the checkpoint files to be written to disc, and
100
 * the executable to clean up. In this case, triggering a global sync every
101
 * 2-10 minutes might be desirable. Matching the global sync frequency with the
102
 * time window for checkpoint and exit is the responsibility of the user!
103
 */
104
struct CheckpointAndExitAfterWallclock : public PhaseChange {
105
  CheckpointAndExitAfterWallclock(const std::optional<double> wallclock_hours,
106
                                  const Options::Context& context = {});
107

108
  explicit CheckpointAndExitAfterWallclock(CkMigrateMessage* msg);
109

110
  /// \cond
111
  CheckpointAndExitAfterWallclock() = default;
1✔
112
  using PUP::able::register_constructor;
113
  WRAPPED_PUPable_decl_template(CheckpointAndExitAfterWallclock);  // NOLINT
×
114
  /// \endcond
115

116
  struct WallclockHours {
117
    using type = Options::Auto<double, Options::AutoLabel::None>;
118
    static constexpr Options::String help = {
119
        "Time in hours after which to write the checkpoint and exit. "
120
        "If 'None' is specified, no action will be taken."};
121
  };
122

123
  using options = tmpl::list<WallclockHours>;
124
  static constexpr Options::String help{
125
      "Once the wallclock time has exceeded the specified amount, trigger "
126
      "writing a checkpoint and then exit."};
127

128
  using argument_tags = tmpl::list<>;
129
  using return_tags = tmpl::list<>;
130

131
  using phase_change_tags_and_combines =
132
      tmpl::list<Tags::RestartPhase, Tags::WallclockHoursAtCheckpoint,
133
                 Tags::CheckpointAndExitRequested>;
134

135
  template <typename Metavariables>
136
  using participating_components = typename Metavariables::component_list;
137

138
  template <typename... DecisionTags>
139
  void initialize_phase_data_impl(
140
      const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
141
          phase_change_decision_data) const;
142

143
  template <typename ParallelComponent, typename ArrayIndex,
144
            typename Metavariables>
145
  void contribute_phase_data_impl(Parallel::GlobalCache<Metavariables>& cache,
146
                                  const ArrayIndex& array_index) const;
147

148
  template <typename... DecisionTags, typename Metavariables>
149
  typename std::optional<std::pair<Parallel::Phase, ArbitrationStrategy>>
150
  arbitrate_phase_change_impl(
151
      const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
152
          phase_change_decision_data,
153
      const Parallel::Phase current_phase,
154
      const Parallel::GlobalCache<Metavariables>& /*cache*/) const;
155

156
  void pup(PUP::er& p) override;
157

158
 private:
159
  std::optional<double> wallclock_hours_for_checkpoint_and_exit_ = std::nullopt;
160
};
161

162
template <typename... DecisionTags>
163
void CheckpointAndExitAfterWallclock::initialize_phase_data_impl(
1✔
164
    const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
165
        phase_change_decision_data) const {
166
  tuples::get<Tags::RestartPhase>(*phase_change_decision_data) = std::nullopt;
1✔
167
  tuples::get<Tags::WallclockHoursAtCheckpoint>(*phase_change_decision_data) =
1✔
168
      std::nullopt;
169
  tuples::get<Tags::CheckpointAndExitRequested>(*phase_change_decision_data) =
1✔
170
      false;
171
}
1✔
172

173
template <typename ParallelComponent, typename ArrayIndex,
174
          typename Metavariables>
175
void CheckpointAndExitAfterWallclock::contribute_phase_data_impl(
176
    Parallel::GlobalCache<Metavariables>& cache,
177
    const ArrayIndex& array_index) const {
178
  if constexpr (std::is_same_v<typename ParallelComponent::chare_type,
179
                               Parallel::Algorithms::Array>) {
180
    Parallel::contribute_to_phase_change_reduction<ParallelComponent>(
181
        tuples::TaggedTuple<Tags::CheckpointAndExitRequested>{true}, cache,
182
        array_index);
183
  } else {
184
    Parallel::contribute_to_phase_change_reduction<ParallelComponent>(
185
        tuples::TaggedTuple<Tags::CheckpointAndExitRequested>{true}, cache);
186
  }
187
}
188

189
template <typename... DecisionTags, typename Metavariables>
190
typename std::optional<std::pair<Parallel::Phase, ArbitrationStrategy>>
191
CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl(
4✔
192
    const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
193
        phase_change_decision_data,
194
    const Parallel::Phase current_phase,
195
    const Parallel::GlobalCache<Metavariables>& /*cache*/) const {
196
  // If no checkpoint-and-exit time given, then do nothing
197
  if (not wallclock_hours_for_checkpoint_and_exit_.has_value()) {
4✔
198
    return std::nullopt;
×
199
  }
200

201
  const double elapsed_hours = sys::wall_time() / 3600.0;
4✔
202

203
  auto& restart_phase =
204
      tuples::get<Tags::RestartPhase>(*phase_change_decision_data);
4✔
205
  auto& wallclock_hours_at_checkpoint =
206
      tuples::get<Tags::WallclockHoursAtCheckpoint>(
4✔
207
          *phase_change_decision_data);
208
  auto& exit_code =
209
      tuples::get<Parallel::Tags::ExitCode>(*phase_change_decision_data);
4✔
210
  if (restart_phase.has_value()) {
4✔
211
    ASSERT(wallclock_hours_at_checkpoint.has_value(),
2✔
212
           "Consistency error: Should have recorded the Wallclock time "
213
           "while recording a phase to restart from.");
214
    // This `if` branch, where restart_phase has a value, is the
215
    // post-checkpoint call to arbitrate_phase_change. Depending on the time
216
    // elapsed so far in this run, next phase is...
217
    // - Exit, if the time is large
218
    // - restart_phase, if the time is small
219
    if (elapsed_hours >= wallclock_hours_at_checkpoint.value()) {
2✔
220
      // Preserve restart_phase for use after restarting from the checkpoint
221
      exit_code = Parallel::ExitCode::ContinueFromCheckpoint;
1✔
222
      return std::make_pair(Parallel::Phase::Exit,
×
223
                            ArbitrationStrategy::RunPhaseImmediately);
1✔
224
    } else {
225
      // Reset restart_phase until it is needed for the next checkpoint
226
      const auto result = restart_phase;
1✔
227
      restart_phase.reset();
1✔
228
      wallclock_hours_at_checkpoint.reset();
1✔
229
      return std::make_pair(result.value(),
1✔
230
                            ArbitrationStrategy::PermitAdditionalJumps);
2✔
231
    }
232
  }
233

234
  auto& checkpoint_and_exit_requested =
235
      tuples::get<Tags::CheckpointAndExitRequested>(
2✔
236
          *phase_change_decision_data);
237
  if (checkpoint_and_exit_requested) {
2✔
238
    checkpoint_and_exit_requested = false;
2✔
239
    // We checked wallclock_hours_for_checkpoint_and_exit_ has value above
240
    if (elapsed_hours >= wallclock_hours_for_checkpoint_and_exit_.value()) {
2✔
241
      // Record phase and actual elapsed time for determining following phase
242
      restart_phase = current_phase;
1✔
243
      wallclock_hours_at_checkpoint = elapsed_hours;
1✔
244
      return std::make_pair(Parallel::Phase::WriteCheckpoint,
×
245
                            ArbitrationStrategy::RunPhaseImmediately);
1✔
246
    }
247
  }
248
  return std::nullopt;
1✔
249
}
250
}  // namespace PhaseControl
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc