celerity / celerity-runtime / build 12009876465

25 Nov 2024 12:19PM UTC, coverage: 94.911% (remained the same)
Trigger: push (github), author: fknorr, commit: "[RM] fixup includes"

3189 of 3626 branches covered (87.95%); branch coverage is included in the aggregate %.
7049 of 7161 relevant lines covered (98.44%), 1541661.11 hits per line.


Source file: /include/handler.h (97.23% covered)

#pragma once

#include "buffer.h"
#include "cgf_diagnostics.h"
#include "communicator.h"
#include "grid.h"
#include "hint.h"
#include "item.h"
#include "launcher.h"
#include "partition.h"
#include "range_mapper.h"
#include "ranges.h"
#include "reduction.h"
#include "sycl_wrappers.h"
#include "task.h"
#include "types.h"
#include "version.h"
#include "workaround.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <functional> // std::invoke
#include <memory>
#include <numeric> // std::lcm
#include <optional>
#include <stdexcept>
#include <string>
#include <tuple> // std::forward_as_tuple, std::get
#include <type_traits>
#include <typeinfo>
#include <utility>
#include <vector>

#include <fmt/format.h>
#include <sycl/sycl.hpp>


namespace celerity {
class handler;
}

namespace celerity::experimental {

/**
 * Constrains the granularity at which a task's global range can be split into chunks.
 *
 * In some situations an output buffer access is only guaranteed to write to non-overlapping subranges
 * if the task is split in a certain way. For example when computing the row-wise sum of a 2D matrix into
 * a 1D vector, a split constraint is required to ensure that each element of the vector is written by
 * exactly one chunk.
 *
 * Another use case is for performance optimization, for example when the creation of lots of small chunks
 * would result in hardware under-utilization and excessive data transfers.
 *
 * Since ND-range parallel_for kernels are already constrained to be split with group size granularity,
 * adding an additional constraint on top results in an effective constraint of LCM(group size, constraint).
 *
 * The constraint (or effective constraint) must evenly divide the global range.
 * This function has no effect when called for a task without a user-provided global range.
 */
template <int Dims>
void constrain_split(handler& cgh, const range<Dims>& constraint);

class collective_group;

} // namespace celerity::experimental
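
// Usage sketch for constrain_split (illustrative): for the row-wise matrix sum described above,
// constraining the split to whole rows guarantees that no element of the 1D result is written by
// more than one chunk. `q`, `H`, `W` and the accessors are assumptions and not part of this header.
//
//     q.submit([&](celerity::handler& cgh) {
//         // ... accessors for the H x W input matrix and the H-element result vector ...
//         celerity::experimental::constrain_split(cgh, celerity::range<2>{1, W});
//         cgh.parallel_for(celerity::range<2>{H, W}, [=](celerity::item<2> item) { /* ... */ });
//     });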

namespace celerity {

namespace detail {
        class task_manager;

        handler make_command_group_handler(const task_id tid, const size_t num_collective_nodes);
        std::unique_ptr<task> into_task(handler&& cgh);
        hydration_id add_requirement(handler& cgh, const buffer_id bid, const access_mode mode, std::unique_ptr<range_mapper_base> rm);
        void add_requirement(handler& cgh, const host_object_id hoid, const experimental::side_effect_order order, const bool is_void);
        void add_reduction(handler& cgh, const reduction_info& rinfo);

        void set_task_name(handler& cgh, const std::string& debug_name);

        struct unnamed_kernel {};

        template <typename KernelName>
        constexpr bool is_unnamed_kernel = std::is_same_v<KernelName, unnamed_kernel>;

        template <typename KernelName>
        std::string kernel_debug_name() {
                return !is_unnamed_kernel<KernelName> ? utils::get_simplified_type_name<KernelName>() : std::string{};
        }

        struct simple_kernel_flavor {};
        struct nd_range_kernel_flavor {};

        template <typename Flavor, int Dims>
        struct kernel_flavor_traits;

        struct no_local_size {};

        template <int Dims>
        struct kernel_flavor_traits<simple_kernel_flavor, Dims> {
                inline static constexpr bool has_local_size = false;
                using local_size_type = no_local_size;
        };

        template <int Dims>
        struct kernel_flavor_traits<nd_range_kernel_flavor, Dims> {
                inline static constexpr bool has_local_size = true;
                using local_size_type = range<Dims>;
        };

        class collective_tag_factory;
} // namespace detail

namespace experimental {
        /**
         * Each collective host task is executed within a collective group. If multiple host tasks are scheduled within the same collective group, they are
         * guaranteed to execute in the same order on every node and within a single thread per node. Each group has its own MPI communicator spanning all
         * participating nodes, so MPI operations the user invokes from different collective groups do not race.
         */
        class collective_group {
          public:
                /// Creates a new collective group with a globally unique id. This must only be called from the application thread.
                collective_group() noexcept : m_cgid(s_next_cgid++) {}

          private:
                friend class detail::collective_tag_factory;
                detail::collective_group_id m_cgid;
                inline static detail::collective_group_id s_next_cgid = detail::root_collective_group_id + 1;
        };
} // namespace experimental

namespace detail {
        /**
         * The collective group used in collective host tasks when no group is specified explicitly.
         */
        inline const experimental::collective_group default_collective_group;

        /**
         * Tag type marking a `handler::host_task` as a collective task. Do not construct this type directly, but use `celerity::experimental::collective`
         * or `celerity::experimental::collective(group)`.
         */
        class collective_tag {
          private:
                friend class collective_tag_factory;
                friend class celerity::handler;
                collective_tag(collective_group_id cgid) : m_cgid(cgid) {}
                collective_group_id m_cgid;
        };

        /**
         * Tag type construction helper. Do not construct this type directly, use `celerity::experimental::collective` instead.
         */
        class collective_tag_factory {
          public:
                operator collective_tag() const { return default_collective_group.m_cgid; }
                collective_tag operator()(experimental::collective_group cg) const { return cg.m_cgid; }
        };
} // namespace detail

namespace experimental {
        /**
         * Pass to `handler::host_task` to select the collective host task overload.
         *
         * Either as a value to schedule with the `default_collective_group`:
         * ```c++
         * cgh.host_task(celerity::experimental::collective, []...);
         * ```
         *
         * Or by specifying a collective group explicitly:
         * ```c++
         * celerity::experimental::collective_group my_group;
         * ...
         * cgh.host_task(celerity::experimental::collective(my_group), []...);
         * ```
         */
        inline constexpr detail::collective_tag_factory collective;
} // namespace experimental

namespace detail {

        /// Tag type marking a `handler::host_task` as a master-node task. Do not construct this type directly, but use `celerity::on_master_node`.
        class on_master_node_tag {};

        /// Tag type marking a `handler::host_task` as a single-item host task equivalent to `range<0>{}`. Do not construct this type directly, but use
        /// `celerity::once`.
        class once_tag {};

        template <typename Kernel, int Dims, typename... Reducers>
        inline void invoke_kernel(const Kernel& kernel, const sycl::id<std::max(1, Dims)>& s_id, const range<Dims>& global_range, const id<Dims>& global_offset,
            const id<Dims>& chunk_offset, Reducers&... reducers) {
                kernel(make_item<Dims>(id_cast<Dims>(id<std::max(1, Dims)>(s_id)) + chunk_offset, global_offset, global_range), reducers...);
        }

        template <typename Kernel, int Dims, typename... Reducers>
        inline void invoke_kernel(const Kernel& kernel, const sycl::nd_item<std::max(1, Dims)>& s_item, const range<Dims>& global_range,
            const id<Dims>& global_offset, const id<Dims>& chunk_offset, const range<Dims>& group_range, const id<Dims>& group_offset, Reducers&... reducers) {
                kernel(make_nd_item<Dims>(s_item, global_range, global_offset, chunk_offset, group_range, group_offset), reducers...);
        }

        template <typename Kernel, int Dims>
        auto bind_simple_kernel(const Kernel& kernel, const range<Dims>& global_range, const id<Dims>& global_offset, const id<Dims>& chunk_offset) {
                return [=](auto s_item_or_id, auto&... reducers) {
                        static_assert(std::is_invocable_v<Kernel, celerity::item<Dims>, decltype(reducers)...>,
                            "Kernel function must be invocable with celerity::item<Dims> and as many reducer objects as reductions passed to parallel_for");
                        if constexpr(CELERITY_WORKAROUND(DPCPP) && std::is_same_v<sycl::id<Dims>, decltype(s_item_or_id)>) {
                                // CELERITY_WORKAROUND_LESS_OR_EQUAL: DPC++ passes a sycl::id instead of a sycl::item to kernels alongside reductions
                                invoke_kernel(kernel, s_item_or_id, global_range, global_offset, chunk_offset, reducers...);
                        } else {
                                invoke_kernel(kernel, s_item_or_id.get_id(), global_range, global_offset, chunk_offset, reducers...);
                        }
                };
        }

        template <typename Kernel, int Dims>
        auto bind_nd_range_kernel(const Kernel& kernel, const range<Dims>& global_range, const id<Dims>& global_offset, const id<Dims> chunk_offset,
            const range<Dims>& group_range, const id<Dims>& group_offset) {
                return [=](sycl::nd_item<std::max(1, Dims)> s_item, auto&... reducers) {
                        static_assert(std::is_invocable_v<Kernel, celerity::nd_item<Dims>, decltype(reducers)...>,
                            "Kernel function must be invocable with celerity::nd_item<Dims> and as many reducer objects as reductions passed to parallel_for");
                        invoke_kernel(kernel, s_item, global_range, global_offset, chunk_offset, group_range, group_offset, reducers...);
                };
        }

        template <typename KernelName, typename... Params>
        inline void invoke_sycl_parallel_for(sycl::handler& cgh, Params&&... args) {
                static_assert(CELERITY_FEATURE_UNNAMED_KERNELS || !is_unnamed_kernel<KernelName>,
                    "Your SYCL implementation does not support unnamed kernels, add a kernel name template parameter to this parallel_for invocation");
                if constexpr(detail::is_unnamed_kernel<KernelName>) {
#if CELERITY_FEATURE_UNNAMED_KERNELS // see static_assert above
                        cgh.parallel_for(std::forward<Params>(args)...);
#endif
                } else {
                        cgh.parallel_for<KernelName>(std::forward<Params>(args)...);
                }
        }
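
        // Illustrative sketch: the KernelName parameter is forwarded to the SYCL parallel_for. On SYCL
        // implementations without CELERITY_FEATURE_UNNAMED_KERNELS a kernel name must be supplied explicitly,
        //
        //     cgh.parallel_for<class my_kernel>(celerity::range<1>{N}, [=](celerity::item<1> item) { /* ... */ });
        //
        // while implementations with unnamed-kernel support also accept the nameless form,
        //
        //     cgh.parallel_for(celerity::range<1>{N}, [=](celerity::item<1> item) { /* ... */ });
        //
        // (`N` and the kernel body are assumptions for illustration only.)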

        template <typename DataT, int Dims, typename BinaryOperation, bool WithExplicitIdentity>
        class reduction_descriptor;

        template <typename DataT, int Dims, typename BinaryOperation, bool WithExplicitIdentity>
        auto make_sycl_reduction(const reduction_descriptor<DataT, Dims, BinaryOperation, WithExplicitIdentity>& d, void* ptr) {
                if constexpr(WithExplicitIdentity) {
                        return sycl::reduction(static_cast<DataT*>(ptr), d.m_identity, d.m_op, sycl::property_list{sycl::property::reduction::initialize_to_identity{}});
                } else {
                        return sycl::reduction(static_cast<DataT*>(ptr), d.m_op, sycl::property_list{sycl::property::reduction::initialize_to_identity{}});
                }
        }

        template <typename DataT, int Dims, typename BinaryOperation>
        class reduction_descriptor<DataT, Dims, BinaryOperation, false /* WithExplicitIdentity */> {
          public:
                reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT /* identity */, bool include_current_buffer_value)
                    : m_bid(bid), m_op(combiner), m_include_current_buffer_value(include_current_buffer_value) {}

          private:
                friend auto make_sycl_reduction<DataT, Dims, BinaryOperation, false>(const reduction_descriptor&, void*);

                buffer_id m_bid;
                BinaryOperation m_op;
                bool m_include_current_buffer_value;
        };

        template <typename DataT, int Dims, typename BinaryOperation>
        class reduction_descriptor<DataT, Dims, BinaryOperation, true /* WithExplicitIdentity */> {
          public:
                reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT identity, bool include_current_buffer_value)
                    : m_bid(bid), m_op(combiner), m_identity(identity), m_include_current_buffer_value(include_current_buffer_value) {}

          private:
                friend auto make_sycl_reduction<DataT, Dims, BinaryOperation, true>(const reduction_descriptor&, void*);

                buffer_id m_bid;
                BinaryOperation m_op;
                DataT m_identity{};
                bool m_include_current_buffer_value;
        };

        template <bool WithExplicitIdentity, typename DataT, int Dims, typename BinaryOperation>
        auto make_reduction(const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation op, DataT identity, const sycl::property_list& prop_list) {
                if(vars.get_range().size() != 1) {
                        // Like SYCL 2020, Celerity only supports reductions to unit-sized buffers. This allows us to avoid tracking different parts of the buffer
                        // as distributed_state and pending_reduction_state.
                        throw std::runtime_error("Only unit-sized buffers can be reduction targets");
                }

                const auto bid = detail::get_buffer_id(vars);
                const auto include_current_buffer_value = !prop_list.has_property<celerity::property::reduction::initialize_to_identity>();

                const auto rid = detail::runtime::get_instance().create_reduction(detail::make_reducer(op, identity));
                add_reduction(cgh, reduction_info{rid, bid, include_current_buffer_value});

                return detail::reduction_descriptor<DataT, Dims, BinaryOperation, WithExplicitIdentity>{bid, op, identity, include_current_buffer_value};
        }

        template <typename... ReductionsAndKernel>
        struct is_reductions_and_kernel_object : std::false_type {};

        template <typename Kernel>
        struct is_reductions_and_kernel_object<Kernel> : std::true_type {};

        template <bool WithExplicitIdentity, typename DataT, int Dims, typename BinaryOperation, typename... ReductionsAndKernel>
        struct is_reductions_and_kernel_object<reduction_descriptor<DataT, Dims, BinaryOperation, WithExplicitIdentity>, ReductionsAndKernel...>
            : is_reductions_and_kernel_object<ReductionsAndKernel...> {};

        template <typename... KernelAndReductions>
        constexpr bool is_reductions_and_kernel_v = is_reductions_and_kernel_object<std::remove_cv_t<std::remove_reference_t<KernelAndReductions>>...>::value;

} // namespace detail

/// Pass to `handler::host_task` to select the master-node task overload.
inline constexpr detail::on_master_node_tag on_master_node;

/// Equivalent to `range<0>{}` when passed to `handler::host_task`.
inline constexpr detail::once_tag once;

class handler {
  public:
        template <typename KernelName = detail::unnamed_kernel, int Dims, typename... ReductionsAndKernel,
            std::enable_if_t<detail::is_reductions_and_kernel_v<ReductionsAndKernel...>, int> = 0>
        void parallel_for(range<Dims> global_range, ReductionsAndKernel&&... reductions_and_kernel) {
                parallel_for_reductions_and_kernel<detail::simple_kernel_flavor, KernelName, Dims, ReductionsAndKernel...>(global_range, id<Dims>(),
                    detail::no_local_size{}, std::make_index_sequence<sizeof...(reductions_and_kernel) - 1>{},
                    std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
        }

        template <typename KernelName = detail::unnamed_kernel, int Dims, typename... ReductionsAndKernel,
            std::enable_if_t<detail::is_reductions_and_kernel_v<ReductionsAndKernel...>, int> = 0>
        void parallel_for(range<Dims> global_range, id<Dims> global_offset, ReductionsAndKernel&&... reductions_and_kernel) {
                parallel_for_reductions_and_kernel<detail::simple_kernel_flavor, KernelName, Dims, ReductionsAndKernel...>(global_range, global_offset,
                    detail::no_local_size{}, std::make_index_sequence<sizeof...(reductions_and_kernel) - 1>{},
                    std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
        }

        template <typename KernelName = detail::unnamed_kernel, typename... ReductionsAndKernel,
            std::enable_if_t<detail::is_reductions_and_kernel_v<ReductionsAndKernel...>, int> = 0>
        void parallel_for(const size_t global_range, ReductionsAndKernel&&... reductions_and_kernel) {
                parallel_for<KernelName>(range<1>(global_range), std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
        }

        template <typename KernelName = detail::unnamed_kernel, typename... ReductionsAndKernel,
            std::enable_if_t<detail::is_reductions_and_kernel_v<ReductionsAndKernel...>, int> = 0>
        void parallel_for(const size_t global_range, const size_t global_offset, ReductionsAndKernel&&... reductions_and_kernel) {
                parallel_for<KernelName>(range<1>(global_range), id<1>(global_offset), std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
        }

        template <typename KernelName = detail::unnamed_kernel, int Dims, typename... ReductionsAndKernel,
            std::enable_if_t<detail::is_reductions_and_kernel_v<ReductionsAndKernel...>, int> = 0>
        void parallel_for(celerity::nd_range<Dims> execution_range, ReductionsAndKernel&&... reductions_and_kernel) {
                parallel_for_reductions_and_kernel<detail::nd_range_kernel_flavor, KernelName, Dims, ReductionsAndKernel...>(execution_range.get_global_range(),
                    execution_range.get_offset(), execution_range.get_local_range(), std::make_index_sequence<sizeof...(reductions_and_kernel) - 1>{},
                    std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
        }
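
        // Usage sketch (illustrative, names such as `acc` and `N` are assumptions): the overloads above
        // mirror SYCL's parallel_for, taking either a global range (plus optional offset) or an nd_range.
        //
        //     cgh.parallel_for(celerity::range<2>{1024, 1024}, [=](celerity::item<2> item) { acc[item] = 0.f; });
        //
        //     cgh.parallel_for(celerity::nd_range<1>{celerity::range<1>{N}, celerity::range<1>{256}},
        //         [=](celerity::nd_item<1> item) { acc[item.get_global_id()] = 1.f; });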

        /**
         * Schedules `task` to execute on the master node only. Call via `cgh.host_task(celerity::on_master_node, []...)`. The task functor is assumed to be
         * invocable with the signature `void(const celerity::partition<0> &)` or `void()`.
         *
         * The task is executed in a background thread and multiple master node tasks may be executed concurrently if they are independent in the task graph, so
         * proper synchronization must be ensured.
         */
        template <typename Functor>
        void host_task(const detail::on_master_node_tag on_master_node, Functor&& task) {
                auto launcher = make_host_task_launcher<0, false>(detail::zeros, detail::non_collective_group_id, std::forward<Functor>(task));
                create_master_node_task(std::move(launcher));
        }
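
        // Example (sketch): printing a result on the master node only, assuming a host accessor `result`
        // for a 1-dimensional buffer has been created for this handler.
        //
        //     cgh.host_task(celerity::on_master_node, [=] { fmt::print("result: {}\n", result[0]); });
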
        /**
         * Schedules `task` to be executed collectively on all nodes participating in the specified collective group. Call via
         * `cgh.host_task(celerity::experimental::collective, []...)` or `cgh.host_task(celerity::experimental::collective(group), []...)`.
         * The task functor is assumed to be invocable with the signature `void(const celerity::experimental::collective_partition&)`
         * or `void(const celerity::partition<1>&)`.
         *
         * This provides a framework for using arbitrary collective MPI operations in a host task, such as performing collective I/O with parallel HDF5.
         * The local node id, the number of participating nodes, as well as the group MPI communicator can be obtained from the `collective_partition` passed into
         * the task functor.
         *
         * All collective tasks within a collective group are guaranteed to be executed in the same order on all nodes.
         */
        template <typename Functor>
        void host_task(const detail::collective_tag collective, Functor&& task) {
                // FIXME: We should not have to know how the global range is determined for collective tasks to create the launcher
                auto launcher = make_host_task_launcher<1, true>(range<3>{m_num_collective_nodes, 1, 1}, collective.m_cgid, std::forward<Functor>(task));
                create_collective_task(collective.m_cgid, std::move(launcher));
        }
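
        // Example (sketch): invoking a collective MPI operation from a collective host task. It is
        // assumed that `collective_partition` exposes the group communicator via `get_communicator()`
        // (see partition.h) and that an `int local_value` is captured from the surrounding scope.
        //
        //     cgh.host_task(celerity::experimental::collective, [=](const celerity::experimental::collective_partition& part) {
        //         int global_sum = 0;
        //         MPI_Allreduce(&local_value, &global_sum, 1, MPI_INT, MPI_SUM, part.get_communicator());
        //     });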

        /**
         * Schedules a distributed execution of `task` by splitting the iteration space in a runtime-defined manner. The task functor is assumed to be invocable
         * with the signature `void(const celerity::partition<Dims>&)`.
         *
         * The task is executed in a background thread with multiple host tasks being run concurrently if they are independent in the task graph, so proper
         * synchronization must be ensured. The partition passed into the task functor describes the split each host receives. It may be used with accessors to
         * obtain the per-node portion of a buffer en-bloc, see `celerity::accessor::get_allocation_window` for details.
         *
         * There are no guarantees with respect to the split size or the order in which host tasks are executed across nodes beyond
         * the restrictions imposed by dependencies in the task graph. Also, the task functor may be invoked multiple times on one node and not be scheduled on
         * another node. If you need guarantees about execution order, consider `host_task(experimental::collective)` instead.
         */
        template <int Dims, typename Functor>
        void host_task(range<Dims> global_range, id<Dims> global_offset, Functor&& task) {
                const detail::task_geometry geometry{
                    Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), get_constrained_granularity(global_range, range<Dims>(detail::ones))};
                auto launcher = make_host_task_launcher<Dims, false>(detail::range_cast<3>(global_range), 0, std::forward<Functor>(task));
                create_host_compute_task(geometry, std::move(launcher));
        }
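
        // Example (sketch, `N` is an assumption): each node receives the subrange of the iteration space
        // described by its partition, here assumed to be queried via `partition<1>::get_subrange()`.
        //
        //     cgh.host_task(celerity::range<1>{N}, [=](const celerity::partition<1>& part) {
        //         const auto sr = part.get_subrange();
        //         fmt::print("processing [{}, {})\n", sr.offset[0], sr.offset[0] + sr.range[0]);
        //     });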

        /**
         * Like `host_task(range<Dims> global_range, id<Dims> global_offset, Functor task)`, but with a `global_offset` of zero.
         */
        template <int Dims, typename Functor>
        void host_task(range<Dims> global_range, Functor&& task) {
                host_task(global_range, {}, std::forward<Functor>(task));
        }

        /**
         * Schedules a host task with a single-element iteration space, causing it to be executed exactly once and on a single cluster node.
         * Equivalent to `host_task(range<0>{}, ...)`.
         */
        template <typename Functor>
        void host_task(const detail::once_tag once, Functor&& task) {
                host_task(range<0>{}, std::forward<Functor>(task));
        }

  private:
        friend handler detail::make_command_group_handler(const detail::task_id tid, const size_t num_collective_nodes);
        friend std::unique_ptr<detail::task> detail::into_task(handler&& cgh);
        friend detail::hydration_id detail::add_requirement(
            handler& cgh, const detail::buffer_id bid, const access_mode mode, std::unique_ptr<detail::range_mapper_base> rm);
        friend void detail::add_requirement(handler& cgh, const detail::host_object_id hoid, const experimental::side_effect_order order, const bool is_void);
        friend void detail::add_reduction(handler& cgh, const detail::reduction_info& rinfo);
        template <int Dims>
        friend void experimental::constrain_split(handler& cgh, const range<Dims>& constraint);
        template <typename Hint>
        friend void experimental::hint(handler& cgh, Hint&& hint);
        friend void detail::set_task_name(handler& cgh, const std::string& debug_name);

        detail::task_id m_tid;
        std::vector<detail::buffer_access> m_buffer_accesses;
        detail::side_effect_map m_side_effects;
        size_t m_non_void_side_effects_count = 0;
        detail::reduction_set m_reductions;
        std::unique_ptr<detail::task> m_task = nullptr;
        size_t m_num_collective_nodes;
        detail::hydration_id m_next_accessor_hydration_id = 1;
        std::optional<std::string> m_usr_def_task_name;
        range<3> m_split_constraint = detail::ones;
        std::vector<std::unique_ptr<detail::hint_base>> m_hints;

        handler(detail::task_id tid, size_t num_collective_nodes) : m_tid(tid), m_num_collective_nodes(num_collective_nodes) {}

        template <typename KernelFlavor, typename KernelName, int Dims, typename... ReductionsAndKernel, size_t... ReductionIndices>
        void parallel_for_reductions_and_kernel(range<Dims> global_range, id<Dims> global_offset,
            typename detail::kernel_flavor_traits<KernelFlavor, Dims>::local_size_type local_size, std::index_sequence<ReductionIndices...> indices,
            ReductionsAndKernel&&... kernel_and_reductions) {
                auto args_tuple = std::forward_as_tuple(kernel_and_reductions...);
                auto&& kernel = std::get<sizeof...(kernel_and_reductions) - 1>(args_tuple);
                parallel_for_kernel_and_reductions<KernelFlavor, KernelName>(
                    global_range, global_offset, local_size, std::forward<decltype(kernel)>(kernel), std::get<ReductionIndices>(args_tuple)...);
        }

        template <typename KernelFlavor, typename KernelName, int Dims, typename Kernel, typename... Reductions>
        void parallel_for_kernel_and_reductions(range<Dims> global_range, id<Dims> global_offset,
            typename detail::kernel_flavor_traits<KernelFlavor, Dims>::local_size_type local_range, Kernel&& kernel, Reductions&... reductions) {
                range<3> granularity = {1, 1, 1};
                if constexpr(detail::kernel_flavor_traits<KernelFlavor, Dims>::has_local_size) {
                        for(int d = 0; d < Dims; ++d) {
                                granularity[d] = local_range[d];
                        }
                }
                const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset),
                    get_constrained_granularity(global_range, detail::range_cast<Dims>(granularity))};
                auto launcher = make_device_kernel_launcher<KernelFlavor, KernelName, Dims>(
                    global_range, global_offset, local_range, std::forward<Kernel>(kernel), std::index_sequence_for<Reductions...>(), reductions...);
                create_device_compute_task(geometry, detail::kernel_debug_name<KernelName>(), std::move(launcher));
        }

        [[nodiscard]] detail::hydration_id add_requirement(const detail::buffer_id bid, const access_mode mode, std::unique_ptr<detail::range_mapper_base> rm) {
                assert(m_task == nullptr);
                m_buffer_accesses.push_back(detail::buffer_access{bid, mode, std::move(rm)});
                return m_next_accessor_hydration_id++;
        }

        void add_requirement(const detail::host_object_id hoid, const experimental::side_effect_order order, const bool is_void) {
                assert(m_task == nullptr);
                m_side_effects.add_side_effect(hoid, order);
                if(!is_void) { m_non_void_side_effects_count++; }
        }

        void add_reduction(const detail::reduction_info& rinfo) {
                assert(m_task == nullptr);
                m_reductions.push_back(rinfo);
        }

        template <int Dims>
        void experimental_constrain_split(const range<Dims>& constraint) {
                assert(m_task == nullptr);
                m_split_constraint = detail::range_cast<3>(constraint);
        }

        template <typename Hint>
        void experimental_hint(Hint&& hint) {
                static_assert(std::is_base_of_v<detail::hint_base, std::decay_t<Hint>>, "Hint must extend hint_base");
                static_assert(std::is_move_constructible_v<Hint>, "Hint must be move-constructible");
                for(auto& h : m_hints) {
                        // We currently don't allow more than one hint of the same type for simplicity; this could be loosened in the future.
                        auto& hr = *h; // Need to do this here to avoid -Wpotentially-evaluated-expression
                        if(typeid(hr) == typeid(hint)) { throw std::runtime_error("Providing more than one hint of the same type is not allowed"); }
                        h->validate(hint);
                }
                m_hints.emplace_back(std::make_unique<std::decay_t<Hint>>(std::forward<Hint>(hint)));
        }

        template <int Dims>
        range<3> get_constrained_granularity(const range<Dims>& global_size, const range<Dims>& granularity) const {
                range<3> result = detail::range_cast<3>(granularity);
                for(int i = 0; i < Dims; ++i) {
                        const auto lcm = std::lcm(granularity[i], m_split_constraint[i]);
                        if(lcm == 0) { throw std::runtime_error("Split constraint cannot be 0"); }
                        result[i] = lcm;
                }
                if(global_size % detail::range_cast<Dims>(result) != range<Dims>(detail::zeros)) {
                        throw std::runtime_error(fmt::format("The{}split constraint {} does not evenly divide the kernel global size {}",
                            granularity.size() > 1 ? " effective " : " ", detail::range_cast<Dims>(result), global_size));
                }
                return result;
        }
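
        // Worked example: with an nd-range kernel of local size 16 and a user-provided split constraint of
        // 24 in the same dimension, the effective granularity is lcm(16, 24) = 48, so the global size in
        // that dimension must be a multiple of 48, otherwise get_constrained_granularity() above throws.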

        void create_host_compute_task(const detail::task_geometry& geometry, detail::host_task_launcher launcher) {
                assert(m_task == nullptr);
                if(geometry.global_size.size() == 0) {
                        // TODO this can be easily supported by not creating a task in case the execution range is empty
                        throw std::runtime_error{"The execution range of distributed host tasks must have at least one item"};
                }
                m_task = detail::task::make_host_compute(m_tid, geometry, std::move(launcher), detail::buffer_access_map(std::move(m_buffer_accesses), geometry),
                    std::move(m_side_effects), std::move(m_reductions));

                m_task->set_debug_name(m_usr_def_task_name.value_or(""));
        }

        void create_device_compute_task(const detail::task_geometry& geometry, const std::string& debug_name, detail::device_kernel_launcher launcher) {
                assert(m_task == nullptr);
                if(geometry.global_size.size() == 0) {
                        // TODO unless reductions are involved, this can be easily supported by not creating a task in case the execution range is empty.
                        // Edge case: If the task includes reductions that specify property::reduction::initialize_to_identity, we need to create a task that sets
                        // the buffer state to an empty pending_reduction_state in the graph_generator. This will cause a trivial reduction_command to be generated on
                        // each node that reads from the reduction output buffer, initializing it to the identity value locally.
                        throw std::runtime_error{"The execution range of device tasks must have at least one item"};
                }
                // Note that cgf_diagnostics has a similar check, but we don't catch void side effects there.
                if(!m_side_effects.empty()) { throw std::runtime_error{"Side effects cannot be used in device kernels"}; }
                m_task = detail::task::make_device_compute(
                    m_tid, geometry, std::move(launcher), detail::buffer_access_map(std::move(m_buffer_accesses), geometry), std::move(m_reductions));

                m_task->set_debug_name(m_usr_def_task_name.value_or(debug_name));
        }

        void create_collective_task(const detail::collective_group_id cgid, detail::host_task_launcher launcher) {
                assert(m_task == nullptr);
                const detail::task_geometry geometry{1, detail::range_cast<3>(range(m_num_collective_nodes)), {}, {1, 1, 1}};
                m_task = detail::task::make_collective(m_tid, geometry, cgid, m_num_collective_nodes, std::move(launcher),
                    detail::buffer_access_map(std::move(m_buffer_accesses), geometry), std::move(m_side_effects));

                m_task->set_debug_name(m_usr_def_task_name.value_or(""));
        }

        void create_master_node_task(detail::host_task_launcher launcher) {
                assert(m_task == nullptr);
                m_task = detail::task::make_master_node(
                    m_tid, std::move(launcher), detail::buffer_access_map(std::move(m_buffer_accesses), detail::task_geometry{}), std::move(m_side_effects));
                m_task->set_debug_name(m_usr_def_task_name.value_or(""));
        }

        template <typename KernelFlavor, typename KernelName, int Dims, typename Kernel, size_t... ReductionIndices, typename... Reductions>
        detail::device_kernel_launcher make_device_kernel_launcher(const range<Dims>& global_range, const id<Dims>& global_offset,
            typename detail::kernel_flavor_traits<KernelFlavor, Dims>::local_size_type local_range, Kernel&& kernel,
            std::index_sequence<ReductionIndices...> /* indices */, Reductions... reductions) {
                static_assert(std::is_copy_constructible_v<std::decay_t<Kernel>>, "Kernel functor must be copyable"); // Required for hydration

                // Check whether all accessors are being captured by value etc.
                // Although the diagnostics should always be available, we currently disable them for some test cases.
                if(detail::cgf_diagnostics::is_available()) { detail::cgf_diagnostics::get_instance().check<target::device>(kernel, m_buffer_accesses); }

                return [=](sycl::handler& sycl_cgh, const detail::box<3>& execution_range, const std::vector<void*>& reduction_ptrs) {
                        constexpr int sycl_dims = std::max(1, Dims);
                        if constexpr(std::is_same_v<KernelFlavor, detail::simple_kernel_flavor>) {
                                const auto sycl_global_range = sycl::range<sycl_dims>(detail::range_cast<sycl_dims>(execution_range.get_range()));
                                detail::invoke_sycl_parallel_for<KernelName>(sycl_cgh, sycl_global_range,
                                    detail::make_sycl_reduction(reductions, reduction_ptrs[ReductionIndices])...,
                                    detail::bind_simple_kernel(kernel, global_range, global_offset, detail::id_cast<Dims>(execution_range.get_offset())));
                        } else if constexpr(std::is_same_v<KernelFlavor, detail::nd_range_kernel_flavor>) {
                                const auto sycl_global_range = sycl::range<sycl_dims>(detail::range_cast<sycl_dims>(execution_range.get_range()));
                                const auto sycl_local_range = sycl::range<sycl_dims>(detail::range_cast<sycl_dims>(local_range));
                                detail::invoke_sycl_parallel_for<KernelName>(sycl_cgh, sycl::nd_range{sycl_global_range, sycl_local_range},
                                    detail::make_sycl_reduction(reductions, reduction_ptrs[ReductionIndices])...,
                                    detail::bind_nd_range_kernel(kernel, global_range, global_offset, detail::id_cast<Dims>(execution_range.get_offset()),
                                        global_range / local_range, detail::id_cast<Dims>(execution_range.get_offset()) / local_range));
                        } else {
                                static_assert(detail::constexpr_false<KernelFlavor>);
                        }
                };
        }

        template <int Dims, bool Collective, typename Kernel>
        detail::host_task_launcher make_host_task_launcher(const range<3>& global_range, const detail::collective_group_id cgid, Kernel&& kernel) {
                static_assert(Collective || std::is_invocable_v<Kernel> || std::is_invocable_v<Kernel, const partition<Dims>>,
                    "Kernel for host task must be invocable with either no arguments or a celerity::partition<Dims>");
                static_assert(!Collective || std::is_invocable_v<Kernel> || std::is_invocable_v<Kernel, const experimental::collective_partition>,
                    "Kernel for collective host task must be invocable with either no arguments or a celerity::experimental::collective_partition");
                static_assert(std::is_copy_constructible_v<std::decay_t<Kernel>>, "Kernel functor must be copyable"); // Required for hydration
                static_assert(Dims >= 0);

                // Check whether all accessors are being captured by value etc.
                // Although the diagnostics should always be available, we currently disable them for some test cases.
                if(detail::cgf_diagnostics::is_available()) {
                        detail::cgf_diagnostics::get_instance().check<target::host_task>(kernel, m_buffer_accesses, m_non_void_side_effects_count);
                }

                return [kernel, global_range](const detail::box<3>& execution_range, const detail::communicator* collective_comm) {
                        (void)global_range;
                        (void)collective_comm;
                        if constexpr(Dims > 0) {
                                if constexpr(Collective) {
                                        static_assert(Dims == 1);
                                        assert(collective_comm != nullptr);
                                        const auto part =
                                            detail::make_collective_partition(detail::range_cast<1>(global_range), detail::box_cast<1>(execution_range), *collective_comm);
                                        kernel(part);
                                } else {
                                        const auto part = detail::make_partition<Dims>(detail::range_cast<Dims>(global_range), detail::box_cast<Dims>(execution_range));
                                        kernel(part);
                                }
                        } else if constexpr(std::is_invocable_v<Kernel, const partition<0>&>) {
                                (void)execution_range;
                                const auto part = detail::make_partition<0>(range<0>(), subrange<0>());
                                kernel(part);
                        } else {
                                (void)execution_range;
                                kernel();
                        }
                };
        }

        std::unique_ptr<detail::task> into_task() && {
                assert(m_task != nullptr);
                for(auto& h : m_hints) {
                        m_task->add_hint(std::move(h));
                }
                return std::move(m_task);
        }
};

namespace detail {

        inline handler make_command_group_handler(const detail::task_id tid, const size_t num_collective_nodes) { return handler(tid, num_collective_nodes); }

        inline std::unique_ptr<detail::task> into_task(handler&& cgh) { return std::move(cgh).into_task(); }

        template <typename CGF>
        std::unique_ptr<task> invoke_command_group_function(const task_id tid, size_t num_collective_nodes, CGF&& cgf) {
                handler cgh = make_command_group_handler(tid, num_collective_nodes);
                std::invoke(std::forward<CGF>(cgf), cgh);
                return into_task(std::move(cgh));
        }

        [[nodiscard]] inline hydration_id add_requirement(handler& cgh, const buffer_id bid, const access_mode mode, std::unique_ptr<range_mapper_base> rm) {
                return cgh.add_requirement(bid, mode, std::move(rm));
        }

        inline void add_requirement(handler& cgh, const host_object_id hoid, const experimental::side_effect_order order, const bool is_void) {
                return cgh.add_requirement(hoid, order, is_void);
        }

        inline void add_reduction(handler& cgh, const detail::reduction_info& rinfo) { return cgh.add_reduction(rinfo); }

        inline void set_task_name(handler& cgh, const std::string& debug_name) { cgh.m_usr_def_task_name = {debug_name}; }

        // TODO: The _impl functions in detail only exist during the grace period for deprecated reductions on const buffers; move outside again afterwards.
        template <typename DataT, int Dims, typename BinaryOperation>
        auto reduction_impl(const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation combiner, const sycl::property_list& prop_list = {}) {
                static_assert(sycl::has_known_identity_v<BinaryOperation, DataT>,
                    "Celerity does not currently support reductions without an identity. Either specialize "
                    "sycl::known_identity or use the reduction() overload taking an identity at runtime");
                return detail::make_reduction<false>(vars, cgh, combiner, sycl::known_identity_v<BinaryOperation, DataT>, prop_list);
        }

        template <typename DataT, int Dims, typename BinaryOperation>
        auto reduction_impl(
            const buffer<DataT, Dims>& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const sycl::property_list& prop_list = {}) {
                static_assert(!sycl::has_known_identity_v<BinaryOperation, DataT>, "Identity is known to SYCL, remove the identity parameter from reduction()");
                return detail::make_reduction<true>(vars, cgh, combiner, identity, prop_list);
        }

} // namespace detail

template <typename DataT, int Dims, typename BinaryOperation>
auto reduction(buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation combiner, const sycl::property_list& prop_list = {}) {
        return detail::reduction_impl(vars, cgh, combiner, prop_list);
}

template <typename DataT, int Dims, typename BinaryOperation>
auto reduction(buffer<DataT, Dims>& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const sycl::property_list& prop_list = {}) {
        return detail::reduction_impl(vars, cgh, identity, combiner, prop_list);
}

template <typename DataT, int Dims, typename BinaryOperation>
[[deprecated("Creating reduction from const buffer is deprecated, capture buffer by reference instead")]] auto reduction(
    const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation combiner, const sycl::property_list& prop_list = {}) {
        return detail::reduction_impl(vars, cgh, combiner, prop_list);
}

template <typename DataT, int Dims, typename BinaryOperation>
[[deprecated("Creating reduction from const buffer is deprecated, capture buffer by reference instead")]] auto reduction(
    const buffer<DataT, Dims>& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const sycl::property_list& prop_list = {}) {
        return detail::reduction_impl(vars, cgh, identity, combiner, prop_list);
}
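
// Usage sketch (illustrative): summing values into a unit-sized buffer `sum_buf`, assuming a handler
// `cgh`, an input accessor `in` and an element count `N`. The reduction object is passed to
// parallel_for ahead of the kernel, mirroring SYCL 2020 reductions.
//
//     cgh.parallel_for(celerity::range<1>{N},
//         celerity::reduction(sum_buf, cgh, sycl::plus<float>{},
//             celerity::property::reduction::initialize_to_identity{}),
//         [=](celerity::item<1> item, auto& sum) { sum += in[item]; });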

} // namespace celerity

namespace celerity::experimental {
template <int Dims>
void constrain_split(handler& cgh, const range<Dims>& constraint) {
        cgh.experimental_constrain_split(constraint);
}

template <typename Hint>
void hint(handler& cgh, Hint&& hint) {
        cgh.experimental_hint(std::forward<Hint>(hint));
}
} // namespace celerity::experimental
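
// Usage sketch (illustrative): attaching a hint and a split constraint to a command group, assuming a
// `celerity::distr_queue q` and one of the hint types declared in hint.h (here: hints::split_2d).
//
//     q.submit([&](celerity::handler& cgh) {
//         celerity::experimental::hint(cgh, celerity::experimental::hints::split_2d{});
//         celerity::experimental::constrain_split(cgh, celerity::range<2>{8, 8});
//         cgh.parallel_for(celerity::range<2>{1024, 1024}, [=](celerity::item<2> item) { /* ... */ });
//     });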